Changeset 1340:1c3b45fb0562

Show
Ignore:
Timestamp:
05/29/08 15:12:36 (7 months ago)
Author:
Greg Von Kuster <greg@bx.psu.edu>
branch:
default
convert_revision:
svn:9bcadc22-80f8-0310-8a53-c8f022958886/galaxy/trunk@2701
Message:

Requires config modification - Add Fastq sniffer, add support for fastqsolexa data type, rename convert_fastq2fasta tool to be fastq_to_fasta_qual, add functional tests for both fastq and fastqsolexa data types, misc code cleanup.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • lib/galaxy/datatypes/converters/fastq_to_fasta_converter.py

    r1333 r1340  
    11#! /usr/bin/python 
    2  
    32""" 
    43convert fastq file to separated sequence and quality files. 
     
    2120assert sys.version_info[:2] >= ( 2, 4 ) 
    2221 
    23  
    2422def stop_err( msg ): 
    25      
    26     sys.stderr.write( "%s\n" % msg ) 
     23    sys.stderr.write( "%s" % msg ) 
    2724    sys.exit() 
    2825 
     26def __main__(): 
     27    infile_name = sys.argv[1] 
     28    outfile = open( sys.argv[2], 'w' ) 
     29    fastq_block_lines = 0 
     30    seq_title_startswith = '' 
    2931 
    30 if __name__ == '__main__': 
    31  
    32     # file I/O 
    33     infile = sys.argv[1] 
    34     outfile_seq = open(sys.argv[2], 'w') 
    35      
    36      
    37     # guessing the first char used in title lines 
    38     leading_char_seq_title = '' 
    39      
    40     every_four_lines = 0 
    41      
    42     for i, line in enumerate(file(infile)): 
    43          
    44         line = line.rstrip()    # get rid of the newline and spaces 
    45          
    46         if ((not line) or (line.startswith('#'))): continue               # comments 
    47          
    48         every_four_lines = (every_four_lines + 1) % 4 
    49         leading_char = line[0:1] 
    50          
    51         if every_four_lines == 1:   # first line is expected to be read title 
    52             if not leading_char_seq_title: 
    53                 leading_char_seq_title = leading_char 
    54             if leading_char != leading_char_seq_title: 
    55                 stop_err('Invalid fastq format at line %d.' %(i)) 
    56             read_title = line[1:] 
    57             outfile_seq.write('>%s\n' %(line[1:])) 
    58              
    59         elif every_four_lines == 2: # second line is expected to be read 
    60             read_length = len(line) 
    61             outfile_seq.write('%s\n' %(line)) 
    62          
     32    for i, line in enumerate( file( infile_name ) ): 
     33        line = line.rstrip() # eliminate trailing space and new line characters 
     34        if not line or line.startswith( '#' ): 
     35            continue 
     36        fastq_block_lines = ( fastq_block_lines + 1 ) % 4 
     37        line_startswith = line[0:1] 
     38        if fastq_block_lines == 1: 
     39            # line 1 is sequence title 
     40            if not seq_title_startswith: 
     41                seq_title_startswith = line_startswith 
     42            if seq_title_startswith != line_startswith: 
     43                stop_err( 'Invalid fastq format at line %d: %s.' %( i + 1, line ) ) 
     44            read_title = line[ 1: ] 
     45            outfile.write( '>%s\n' % line[1:] ) 
     46        elif fastq_block_lines == 2: 
     47            # line 2 is nucleotides 
     48            read_length = len( line ) 
     49            outfile.write( '%s\n' % line ) 
    6350        else: 
    6451            pass 
    6552 
    66     outfile_seq.close() 
     53    outfile.close() 
    6754 
    68      
    69      
     55if __name__ == "__main__": __main__()  
  • lib/galaxy/datatypes/converters/fastq_to_fasta_converter.xml

    r1333 r1340  
    1 <tool id="CONVERTER_fastq_to_fasta_0" name="FASTQ-to-FASTA" version="1.0.0"> 
    2   <description>converts FASTQ file to FASTA format</description> 
     1<tool id="CONVERTER_fastq_to_fasta_0" name="Convert Fastq to Fasta" version="1.0.0"> 
     2  <description>converts Fastq file to Fasta format</description> 
    33  <command interpreter="python">fastq_to_fasta_converter.py $input $output</command> 
    44  <inputs> 
    5     <param name="input" type="data" format="fastq" label="Fastq file"/> 
     5    <param name="input" type="data" format="fastq" label="Choose Fastq file"/> 
    66  </inputs> 
    77  <outputs> 
  • lib/galaxy/datatypes/converters/fastq_to_qual_converter.py

    r1333 r1340  
    11#! /usr/bin/python 
    2  
    32""" 
    43convert fastq file to separated sequence and quality files. 
     
    1514%python convert_fastq2fasta.py <your_fastq_filename> <output_seq_filename> <output_score_filename> 
    1615""" 
    17  
    1816import sys, os 
    1917from math import * 
     
    2119assert sys.version_info[:2] >= ( 2, 4 ) 
    2220 
    23  
    2421def stop_err( msg ): 
    25      
    26     sys.stderr.write( "%s\n" % msg ) 
     22    sys.stderr.write( "%s" % msg ) 
    2723    sys.exit() 
    2824 
    29  
    30 if __name__ == '__main__': 
    31  
    32     # file I/O 
    33     infile = sys.argv[1] 
    34     outfile_score = open(sys.argv[2], 'w')     
     25def __main__(): 
     26    infile_name = sys.argv[1] 
     27    outfile_score = open( sys.argv[2], 'w' )     
     28    qual_title_startswith = '' 
     29    seq_title_startswith = '' 
     30    default_coding_value = 64 
     31    fastq_block_lines = 0 
    3532     
    36     # guessing the first char used in title lines 
    37     leading_char_quality_title = '' 
    38     leading_char_seq_title = '' 
    39     default_coding_value = 64 
    40      
    41     every_four_lines = 0 
    42      
    43     for i, line in enumerate(file(infile)): 
    44          
    45         line = line.rstrip()    # get rid of the newline and spaces 
    46          
    47         if ((not line) or (line.startswith('#'))): continue               # comments 
    48          
    49         every_four_lines = (every_four_lines + 1) % 4 
    50         leading_char = line[0:1] 
    51          
    52         if every_four_lines == 1:   # first line is expected to be read title 
    53             if not leading_char_seq_title: 
    54                 leading_char_seq_title = leading_char 
    55             if leading_char != leading_char_seq_title: 
    56                 stop_err('Invalid fastq format at line %d.' %(i)) 
     33    for i, line in enumerate( file( infile_name ) ): 
     34        line = line.rstrip() 
     35        if not line or line.startswith( '#' ): 
     36            continue 
     37        fastq_block_lines = ( fastq_block_lines + 1 ) % 4 
     38        line_startswith = line[0:1] 
     39        if fastq_block_lines == 1: 
     40            # first line is @title_of_seq 
     41            if not seq_title_startswith: 
     42                seq_title_startswith = line_startswith 
     43            if line_startswith != seq_title_startswith: 
     44                stop_err( 'Invalid fastq format at line %d: %s.' % ( i + 1, line ) ) 
    5745            read_title = line[1:] 
    58              
    59         elif every_four_lines == 2: # second line is expected to be read 
    60             read_length = len(line
    61              
    62         elif every_four_lines == 3: # third line is expected to be quality title 
    63             if not leading_char_quality_title
    64                 leading_char_quality_title = leading_char 
    65             if leading_char != leading_char_quality_title
    66                 stop_err('Invalid fastq format at line %d.' %(i))     
     46        elif fastq_block_lines == 2: 
     47            # second line is nucleotides 
     48            read_length = len( line
     49        elif fastq_block_lines == 3: 
     50            # third line is +title_of_qualityscore (might be skipped) 
     51            if not qual_title_startswith
     52                qual_title_startswith = line_startswith 
     53            if line_startswith != qual_title_startswith
     54                stop_err( 'Invalid fastq format at line %d: %s.' % ( i + 1, line ) )     
    6755            quality_title = line[1:] 
    68              
    69             if (quality_title and (read_title != quality_title)): 
    70                 stop_err('Invalid fastq format: titles for sequence and quality score are different.') 
    71  
     56            if quality_title and read_title != quality_title: 
     57                stop_err( 'Invalid fastq format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) ) 
    7258            if not quality_title: 
    73                 outfile_score.write('>%s\n' %(read_title)
     59                outfile_score.write( '>%s\n' % read_title
    7460            else: 
    75                 outfile_score.write('>%s\n' %(line[1:])
    76                  
    77         else:   # fourth line is expected to be the ASCII-coded quality scores  
     61                outfile_score.write( '>%s\n' % line[1:]
     62        else: 
     63            # fourth line is quality scores 
    7864            qual = '' 
    79              
    80             # peek: ascii code or digits? 
    81             first_value = line.split()[0] 
    82              
    83             if first_value.isdigit(): 
     65            # peek: ascii or digits? 
     66            val = line.split()[0] 
     67            if val.isdigit(): 
    8468                # digits 
    8569                qual = line 
    8670            else: 
    87                 # ascii code 
    88                 # guess leading char 
    89                 quality_score_length = len(line) 
    90                 if quality_score_length == (read_length+1): # first char is leading_char_score 
    91                     leading_char_score = ord(line[0:1]) 
     71                # ascii 
     72                quality_score_length = len( line ) 
     73                if quality_score_length == read_length + 1: 
     74                    quality_score_startswith = ord( line[0:1] ) 
    9275                    line = line[1:] 
    9376                elif quality_score_length == read_length: 
    94                     leading_char_score = default_coding_value                 # default 
     77                    quality_score_startswith = default_coding_value 
    9578                else: 
    96                     stop_err('Invalid fastq format: the number of quality scores is not the same as bases.') 
    97                          
    98                 for j, char in enumerate(line): 
    99                     score = ord(char)-leading_char_score    # 64 
    100                     qual += (str(score) + ' ') 
    101                      
    102             outfile_score.write('%s\n' %(qual)) 
     79                    stop_err( 'Invalid fastq format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) ) 
     80                for j, char in enumerate( line ): 
     81                    score = ord( char ) - quality_score_startswith    # 64 
     82                    qual = "%s%s " % ( qual, str( score ) ) 
     83            outfile_score.write( '%s\n' % qual ) 
    10384                             
    10485    outfile_score.close() 
     86 
     87if __name__ == "__main__": __main__()   
    10588     
    106      
  • lib/galaxy/datatypes/converters/fastq_to_qual_converter.xml

    r1333 r1340  
    1 <tool id="CONVERTER_fastq_to_qual_0" name="FASTQ-to-FASTA"> 
    2   <command interpreter="python">fastq_to_qual_converter.py $input1 $output1 </command> 
     1<tool id="CONVERTER_fastq_to_qual_0" name="Convert Fastq to Qual"> 
     2  <command interpreter="python">fastq_to_qual_converter.py $input1 $output1</command> 
    33  <inputs> 
    4     <param format="fastq" name="input1" type="data" label="Fastq file"/> 
     4    <param format="fastq" name="input1" type="data" label="Choose Fastq file"/> 
    55  </inputs> 
    66  <outputs> 
  • lib/galaxy/datatypes/sequence.py

    r1328 r1340  
    55import data 
    66import logging 
     7import re 
    78from cgi import escape 
    89from galaxy.datatypes.metadata import MetadataElement 
     
    9091 
    9192class Fastq( Sequence ): 
    92     """Class representing a FASTQ sequence""" 
    93     # FASTQ format stores sequences and Phred qualities in a single file. It is concise and compact.  
    94     # FASTQ is first widely used in the Sanger Institute and therefore we usually take the Sanger  
    95     # specification and the standard FASTQ format, or simply FASTQ format. Although Solexa/Illumina  
    96     # read file looks pretty much like FASTQ, they are different in that the qualities are scaled  
    97     # differently. In the quality string, if you can see a character with its ASCII code higher than  
    98     # 90, probably your file is in the Solexa/Illumina format. 
    99     # 
    100     # For details, see http://maq.sourceforge.net/fastq.shtml 
     93    """Class representing a FASTQ sequence ( the Sanger/Standard variant )""" 
    10194    file_ext = "fastq" 
    10295 
    10396    def set_peek( self, dataset ): 
    10497        Sequence.set_peek( self, dataset ) 
    105         sequences = 0 
    106         scores = 0 
     98        count = 0 
     99        size = 0 
     100        bases_regexp = re.compile("^[NGTAC]*$") 
    107101        for line in file( dataset.file_name ): 
    108             if line: 
    109                 if line.startswith( "@" ): 
    110                     sequences += 1 
    111                 elif line.startswith( '+' ): 
    112                     scores += 1 
    113         dataset.blurb = '%d sequences, %d quality scores' % ( sequences, scores ) 
     102            if line and line.startswith( ">" ): 
     103                count += 1 
     104            elif bases_regexp.match( line ): 
     105                line = line.strip() 
     106                size += len( line ) 
     107        if count == 1: 
     108            dataset.blurb = '%d bases' % size 
     109        else: 
     110            dataset.blurb = '%d sequences' % count 
     111 
     112    def sniff(self, filename): 
     113        """ 
     114        Determines whether the file is in fastq format ( the Sanger/Standard variant ) 
     115        For details, see http://maq.sourceforge.net/fastq.shtml 
     116 
     117        Note: There are two kinds of FASTQ files, known as "Sanger" (sometimes called "Standard") and Solexa 
     118              These differ in the representation of the quality scores 
     119 
     120        >>> fname = get_test_fname( '1.fastq' ) 
     121        >>> Fastq().sniff( fname ) 
     122        True 
     123        >>> fname = get_test_fname( '1.fastqsolexa' ) 
     124        >>> Fastq().sniff( fname ) 
     125        False 
     126        """ 
     127        headers = get_headers( filename, None ) 
     128        bases_regexp = re.compile( "^[NGTAC]*$" ) 
     129        try: 
     130            if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0] and headers[3][0]: 
     131                # Check the sequence line, make sure it contains only G/C/A/T/N 
     132                if not bases_regexp.match( headers[1][0] ): 
     133                    return False 
     134                # The quality score line 
     135                qscore = headers[3][0] 
     136                # In Standard/Sanger format, the quality score is a single string, whose length should be equal to the length of the sequence 
     137                if len( qscore ) != len( headers[1][0] ): 
     138                    return False  
     139                #Check the quality score values - in Sanger/Standard these should be ASCII characters between "!" (0x21) and "~" (0x7E) 
     140                for x in qscore: 
     141                    if ord( x ) < 0x21 or ord( x ) > 0x7e: 
     142                        return False 
     143                return True 
     144            return False 
     145        except: 
     146            return False 
     147 
     148class FastqSolexa( Sequence ): 
     149    """Class representing a FASTQ sequence ( the Solexa variant )""" 
     150    file_ext = "fastqsolexa" 
     151 
     152    def set_peek( self, dataset ): 
     153        Sequence.set_peek( self, dataset ) 
     154        count = size = 0 
     155        bases_regexp = re.compile("^[NGTAC]*$") 
     156        for line in file( dataset.file_name ): 
     157            if line and line[0] == ">": 
     158                count += 1 
     159            elif bases_regexp.match(line): 
     160                line = line.strip() 
     161                size += len(line) 
     162        if count == 1: 
     163            dataset.blurb = '%d bases' % size 
     164        else: 
     165            dataset.blurb = '%d sequences' % count 
     166 
     167    def sniff( self, filename ): 
     168        """ 
     169        Determines whether the file is in fastq format (Solexa Variant) 
     170        For details, see http://maq.sourceforge.net/fastq.shtml 
     171 
     172        Note: There are two kinds of FASTQ files, known as "Sanger" (sometimes called "Standard") and Solexa 
     173              These differ in the representation of the quality scores 
     174 
     175        >>> fname = get_test_fname( '1.fastq' ) 
     176        >>> SolexaFastq().sniff( fname ) 
     177        False 
     178        >>> fname = get_test_fname( '1.fastqsolexa' ) 
     179        >>> SolexaFastq().sniff( fname ) 
     180        True 
     181        """ 
     182        headers = get_headers( filename, None ) 
     183        bases_regexp = re.compile( "^[NGTAC]*$" ) 
     184        try: 
     185            if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]: 
     186                # Check the sequence line, make sure it contains only G/C/A/T/N 
     187                if not bases_regexp.match( headers[1][0] ): 
     188                    return False 
     189                qscore = headers[3] 
     190                # In Solexa format, the quality score is a list of numbers, whose length should be equal to the length of the sequence 
     191                if len( qscore ) != len( headers[1][0] ): 
     192                    return False 
     193                # Check the quality score values - in Solexa/FASTQ these should be valid decimal numbers 
     194                # (if "x" is not a valid number, "int" will raise an exception) 
     195                for x in qscore: 
     196                    try: 
     197                        check = int( x ) 
     198                    except: 
     199                        return False 
     200                return True  
     201            return False 
     202        except: 
     203            return False 
    114204 
    115205try: 
     206    from galaxy import eggs 
    116207    import pkg_resources; pkg_resources.require( "bx-python" ) 
    117208    import bx.align.maf 
     
    280371        except: 
    281372            return False 
    282  
    283  
  • test/functional/test_sniffing_and_metadata_settings.py

    r661 r1340  
    3535        self.check_history_for_string('1.fasta format: <span class="fasta">fasta</span>, database: \? Info: uploaded file') 
    3636        self.check_metadata_for_string('value="1.fasta" value="\?" Change data type selected value="fasta" selected="yes"') 
     37        self.delete_history_item( 1 ) 
     38    def test_17_fastq_datatype( self ): 
     39        """Testing correctly sniffing fastq ( the Sanger/Standard variant ) data type upon upload""" 
     40        self.upload_file('1.fastq') 
     41        self.verify_dataset_correctness('1.fastq') 
     42        self.check_history_for_string('1.fastq format: <span class="fastq">fastq</span>, database: \? Info: uploaded fastq file') 
     43        self.delete_history_item( 1 ) 
     44    def test_18_fastq_datatype( self ): 
     45        """Testing correctly sniffing fastq ( the Solexa variant ) data type upon upload""" 
     46        self.upload_file('1.fastqsolexa') 
     47        self.verify_dataset_correctness('1.fastqsolexa') 
     48        self.check_history_for_string('1.fastqsolexa format: <span class="fastqsolexa">fastqsolexa</span>, database: \? Info: uploaded fastqsolexa file') 
    3749        self.delete_history_item( 1 ) 
    3850    def test_20_gff_datatype( self ): 
  • tool_conf.xml.sample

    r1328 r1340  
    6060    <tool file="filters/bed2gff.xml" /> 
    6161    <tool file="fasta_tools/fasta_to_tabular.xml" /> 
    62     <tool file="metag_tools/convert_fastq2fasta.xml" /> 
     62    <tool file="metag_tools/fastq_to_fasta_qual.xml" /> 
    6363    <tool file="filters/gff2bed.xml" /> 
    6464    <tool file="filters/lav_to_bed.xml" /> 
  • tools/data_source/upload.xml

    r1338 r1340  
    2626**Auto-detect** 
    2727 
    28 The system will attempt to detect AXT, FASTA, Gff, HTML, LAV, Maf, Tabular, Wiggle, BED and Interval (BED with headers) formats. If your file is not detected properly as one of the known formats, it most likely means that it has some format problems (e.g., different number of columns on different rows). You can still coerce the system to set your data to the format you think it should be (please send us a note if you see a case when a valid format is not detected).  You can also upload valid files that are compressed (gzipped), which will automatically be decompressed upon upload.  
     28The system will attempt to detect Axt, Fasta, Fastq, Gff, Gff3, Html, Lav, Maf, Tabular, Wiggle, Bed and Interval (Bed with headers) formats. If your file is not detected properly as one of the known formats, it most likely means that it has some format problems (e.g., different number of columns on different rows). You can still coerce the system to set your data to the format you think it should be.  You can also upload compressed files, which will automatically be decompressed.  
    2929 
    3030----- 
     
    3636----- 
    3737 
    38 **AXT** 
     38**Axt** 
    3939 
    4040blastz pairwise alignment format.  Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.  Blocks are separated from one another by blank lines.  The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields. 
     
    4848----- 
    4949 
    50 **BED** 
     50**Bed** 
    5151 
    5252* Tab delimited format (tabular) 
     
    7777----- 
    7878 
    79 **FASTA** 
     79**Fasta** 
    8080 
    8181A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.  The first character of the description line is a greater-than (">") symbol in the first column.  All lines should be shorter than 80 characters:: 
     
    8787    tttcgtgcgtatag 
    8888    tggcgcggtga 
     89 
     90----- 
     91 
     92**Fastq** 
     93 
     94Fastq format stores sequences and Phred qualities in a single file. We define Fastq as the Sanger/Standard variant:: 
     95 
     96        @EAS54_6_R1_2_1_413_324 
     97        CCCTTCTTGTCTTCAGCGTTTCTCC 
     98        + 
     99        ;;3;;;;;;;;;;;;7;;;;;;;88 
     100        @EAS54_6_R1_2_1_540_792 
     101        TTGGCAGGCCAAGGCCGATGGATCA 
     102        + 
     103        ;;;;;;;;;;;7;;;;;-;;;3;83 
     104        @EAS54_6_R1_2_1_443_348 
     105        GTTGCTTCTGGCGTGGGTGGGGGGG 
     106        +EAS54_6_R1_2_1_443_348 
     107        ;;;;;;;;;;;9;7;;.7;393333 
    89108 
    90109----- 
  • tools/metag_tools/fastq_to_fasta_qual.py

    r1330 r1340  
    2121assert sys.version_info[:2] >= ( 2, 4 ) 
    2222 
    23  
    2423def stop_err( msg ): 
    25      
    26     sys.stderr.write( "%s\n" % msg ) 
     24    sys.stderr.write( "%s" % msg ) 
    2725    sys.exit() 
    2826 
    29  
    30 if __name__ == '__main__': 
    31  
    32     # file I/O 
    33     infile = sys.argv[1] 
    34     outfile_seq = open(sys.argv[2], 'w') 
    35     outfile_score = open(sys.argv[3], 'w')     
     27def __main__(): 
     28    infile_name = sys.argv[1] 
     29    outfile_seq = open( sys.argv[2], 'w' ) 
     30    outfile_score = open( sys.argv[3], 'w' )     
     31    seq_title_startswith = '' 
     32    qual_title_startswith = '' 
     33    default_coding_value = 64 
     34    fastq_block_lines = 0 
    3635     
    37     # guessing the first char used in title lines 
    38     leading_char_seq_title = '' 
    39     leading_char_quality_title = '' 
    40     default_coding_value = 64 
    41      
    42     every_four_lines = 0 
    43      
    44     for i, line in enumerate(file(infile)): 
    45          
    46         line = line.rstrip()    # get rid of the newline and spaces 
    47          
    48         if ((not line) or (line.startswith('#'))): continue               # comments 
    49          
    50         every_four_lines = (every_four_lines + 1) % 4 
    51         leading_char = line[0:1] 
    52          
    53         if every_four_lines == 1:   # first line is expected to be read title 
    54             if not leading_char_seq_title: 
    55                 leading_char_seq_title = leading_char 
    56             if leading_char != leading_char_seq_title: 
    57                 stop_err('Invalid fastq format at line %d.' %(i)) 
     36    for i, line in enumerate( file( infile_name ) ): 
     37        line = line.rstrip() 
     38        if not line or line.startswith( '#' ): 
     39            continue 
     40        fastq_block_lines = ( fastq_block_lines + 1 ) % 4 
     41        line_startswith = line[0:1] 
     42        if fastq_block_lines == 1: 
     43            # first line is @title_of_seq 
     44            if not seq_title_startswith: 
     45                seq_title_startswith = line_startswith 
     46            if line_startswith != seq_title_startswith: 
     47                stop_err( 'Invalid fastq format at line %d: %s.' % ( i + 1, line ) ) 
    5848            read_title = line[1:] 
    59             outfile_seq.write('>%s\n' %(line[1:])
    60              
    61         elif every_four_lines == 2: # second line is expected to be read 
    62             read_length = len(line
    63             outfile_seq.write('%s\n' %(line)
    64              
    65         elif every_four_lines == 3: # third line is expected to be quality title 
    66             if not leading_char_quality_title
    67                 leading_char_quality_title = leading_char 
    68             if leading_char != leading_char_quality_title
    69                 stop_err('Invalid fastq format at line %d.' %(i))     
     49            outfile_seq.write( '>%s\n' % line[1:]
     50        elif fastq_block_lines == 2: 
     51            # second line is nucleotides 
     52            read_length = len( line
     53            outfile_seq.write( '%s\n' % line
     54        elif fastq_block_lines == 3: 
     55            # third line is +title_of_qualityscore ( might be skipped ) 
     56            if not qual_title_startswith
     57                qual_title_startswith = line_startswith 
     58            if line_startswith != qual_title_startswith
     59                stop_err( 'Invalid fastq format at line %d: %s.' % ( i + 1, line ) )     
    7060            quality_title = line[1:] 
    71              
    72             if (quality_title and (read_title != quality_title)): 
    73                 stop_err('Invalid fastq format: titles for sequence and quality score are different.') 
    74  
     61            if quality_title and read_title != quality_title: 
     62                stop_err( 'Invalid fastq format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) ) 
    7563            if not quality_title: 
    76                 outfile_score.write('>%s\n' %(read_title)
     64                outfile_score.write( '>%s\n' % read_title
    7765            else: 
    78                 outfile_score.write('>%s\n' %(line[1:])
    79                  
    80         else:   # fourth line is expected to be the ASCII-coded quality scores  
     66                outfile_score.write( '>%s\n' % line[1:]
     67        else: 
     68            # fourth line is quality scores 
    8169            qual = '' 
    82              
    83             # peek: ascii code or digits? 
    84             first_value = line.split()[0] 
    85              
    86             if first_value.isdigit(): 
     70            # peek: ascii or digits? 
     71            val = line.split()[0] 
     72            if val.isdigit(): 
    8773                # digits 
    8874                qual = line 
    8975            else: 
    90                 # ascii code 
    91                 # guess leading char 
    92                 quality_score_length = len(line) 
    93                 if quality_score_length == (read_length+1): # first char is leading_char_score 
    94                     leading_char_score = ord(line[0:1]
     76                # ascii 
     77                quality_score_length = len( line ) 
     78                if quality_score_length == read_length + 1: 
     79                    # first char is qual_score_startswith 
     80                    qual_score_startswith = ord( line[0:1]
    9581                    line = line[1:] 
    9682                elif quality_score_length == read_length: 
    97                     leading_char_score = default_coding_value                 # default 
     83                    qual_score_startswith = default_coding_value 
    9884                else: 
    99                     stop_err('Invalid fastq format: the number of quality scores is not the same as bases.') 
    100                          
    101                 for j, char in enumerate(line): 
    102                     score = ord(char)-leading_char_score    # 64 
    103                     qual += (str(score) + ' ') 
    104                      
    105             outfile_score.write('%s\n' %(qual)) 
    106                              
     85                    stop_err( 'Invalid fastq format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) ) 
     86                for j, char in enumerate( line ): 
     87                    score = ord( char ) - qual_score_startswith    # 64 
     88                    qual = "%s%s " % ( qual, str( score ) ) 
     89            outfile_score.write( '%s\n' % qual ) 
     90               
    10791    outfile_seq.close() 
    10892    outfile_score.close() 
     93 
     94if __name__ == "__main__": __main__()  
    10995     
    110      
  • tools/metag_tools/fastq_to_fasta_qual.xml

    r1336 r1340  
    1 <tool id="convert_fastq2fasta" name="FASTQ-to-FASTA" version="1.0.0"> 
    2   <description>converts FASTQ file to FASTA format</description> 
    3   <command interpreter="python">convert_fastq2fasta.py $input1 $output1 $output2</command> 
     1<tool id="fastq_to_fasta_qual" name="FASTQ-to-FASTA-QUAL" version="1.0.0"> 
     2  <description>extracts sequences and quality scores from FASTQ data</description> 
     3  <command interpreter="python">fastq_to_fasta_qual.py $input1 $output1 $output2</command> 
    44  <inputs> 
    5     <param name="input1" type="data" format="fastq" label="Fastq file"/> 
     5    <param name="input1" type="data" format="fastq,fastqsolexa" label="Fastq file"/> 
    66  </inputs> 
    77  <outputs> 
     
    1313    <test> 
    1414      <param name="input1" value="1.fastq" ftype="fastq" /> 
    15       <output name="output1" file="convert_fastq2fasta_out2.fasta" /> 
     15      <output name="output1" file="fastq_to_fasta_qual_out2.fasta" /> 
     16    </test> 
     17    <test> 
     18      <param name="input1" value="1.fastqsolexa" ftype="fastq" /> 
     19      <output name="output1" file="fastq_to_fasta_qual_out4.fasta" /> 
    1620    </test> 
    1721  </tests> 
     
    2024**What it does** 
    2125 
    22 This tool converts Solexa FASTQ data to FASTA format by generating 2 files, reads and quality scores.    
     26This tool extracts sequences and quality scores from FASTQ data ( both Sanger/Standard and Solexa variants ), producing a FASTA dataset and a QUAL dataset.    
    2327 
    2428----- 
     
    2630**Example1** 
    2731 
    28 - Converting the following Solexa fastq data:: 
     32- Converting the following Sanger/Standard fastq data:: 
    2933 
    3034    @seq1   
     
    5155    40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15  
    5256 
    53  
    5457**Example2** 
    5558 
    5659- Converting the following Solexa fastq data:: 
    5760 
    58     @seq1 
     61    @HANNIBAL_1_FC302VTAAXX:2:1:228:167 
    5962    GAATTGATCAGGACATAGGACAACTGTAGGCACCAT 
    60     +seq1 
     63    +HANNIBAL_1_FC302VTAAXX:2:1:228:167 
    6164    40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 
    62     @seq2 
     65    @HANNIBAL_1_FC302VTAAXX:2:1:156:340 
    6366    GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG 
    64     +seq2 
     67    +HANNIBAL_1_FC302VTAAXX:2:1:156:340 
    6568    40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 
    6669 
    6770- will extract the following sequences:: 
    6871 
    69     &gt;seq1 
     72    >HANNIBAL_1_FC302VTAAXX:2:1:228:167 
    7073    GAATTGATCAGGACATAGGACAACTGTAGGCACCAT 
    71     &gt;seq2 
     74    >HANNIBAL_1_FC302VTAAXX:2:1:156:340 
    7275    GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG 
    7376 
    7477- and quality scores:: 
    7578 
    76     &gt;seq1 
    77     40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4  
    78     &gt;seq2 
    79     40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9  
     79    >HANNIBAL_1_FC302VTAAXX:2:1:228:167 
     80    40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 
     81    >HANNIBAL_1_FC302VTAAXX:2:1:156:340 
     82    40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 
    8083 
    8184    </help> 
  • universe_wsgi.ini.sample

    r1334 r1340  
    177177fasta = galaxy.datatypes.sequence:Fasta,display_in_upload 
    178178fastq = galaxy.datatypes.sequence:Fastq,display_in_upload 
     179fastqsolexa = galaxy.datatypes.sequence:FastqSolexa,display_in_upload 
    179180gff = galaxy.datatypes.interval:Gff,display_in_upload 
    180181gff3 = galaxy.datatypes.interval:Gff3,display_in_upload 
     
    30130210 = galaxy.datatypes.sequence:Lav 
    30230315 = galaxy.datatypes.sequence:Fasta 
    303 20 = galaxy.datatypes.interval:Wiggle 
    304 25 = galaxy.datatypes.images:Html 
    305 30 = galaxy.datatypes.sequence:Axt 
    306 35 = galaxy.datatypes.interval:Bed 
    307 40 = galaxy.datatypes.interval:CustomTrack 
    308 45 = galaxy.datatypes.interval:Gff 
    309 50 = galaxy.datatypes.interval:Gff3 
    310 55 = galaxy.datatypes.interval:Interval 
     30420 = galaxy.datatypes.sequence:Fastq 
     30525 = galaxy.datatypes.sequence:FastqSolexa 
     30630 = galaxy.datatypes.interval:Wiggle 
     30735 = galaxy.datatypes.images:Html 
     30840 = galaxy.datatypes.sequence:Axt 
     30945 = galaxy.datatypes.interval:Bed 
     31050 = galaxy.datatypes.interval:CustomTrack 
     31155 = galaxy.datatypes.interval:Gff 
     31260 = galaxy.datatypes.interval:Gff3 
     31365 = galaxy.datatypes.interval:Interval