Changeset 1367:2328b62c0846

Show
Ignore:
Timestamp:
06/06/08 14:52:11 (7 months ago)
Author:
Wen-Yu Chung <wychung@bx.psu.edu>
branch:
default
convert_revision:
svn:9bcadc22-80f8-0310-8a53-c8f022958886/galaxy/trunk@2728
Message:

Update fastq format.
Now we only support FastqSolexa variants.
If the quality scores are presented as characters,
the integer values are obtained by subtracting 64 from their ASCII codes.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • datatype_converters_conf.xml.sample

    r1366 r1367  
    33  <converter file="bed_to_gff_converter.xml" source_datatype="bed" target_datatype="gff"/> 
    44  <converter file="fasta_to_tabular_converter.xml" source_datatype="fasta" target_datatype="tabular"/> 
    5   <converter file="fastq_to_fasta_converter.xml" source_datatype="fastq,fastqsolexa" target_datatype="fasta"/> 
    6   <converter file="fastq_to_qual_converter.xml" source_datatype="fastq,fastqsolexa" target_datatype="qual"/> 
     5  <converter file="fastq_to_fasta_converter.xml" source_datatype="fastqsolexa" target_datatype="fasta"/> 
     6  <converter file="fastq_to_qual_converter.xml" source_datatype="fastqsolexa" target_datatype="qual"/> 
    77  <converter file="gff_to_bed_converter.xml" source_datatype="gff" target_datatype="bed"/> 
    88  <converter file="interval_to_bed_converter.xml" source_datatype="interval" target_datatype="bed"/> 
  • lib/galaxy/datatypes/converters/fastq_to_fasta_converter.xml

    r1366 r1367  
    33  <command interpreter="python">fastq_to_fasta_converter.py $input $output</command> 
    44  <inputs> 
    5     <param name="input" type="data" format="fastq,fastqsolexa" label="Choose Fastq file"/> 
     5    <param name="input" type="data" format="fastqsolexa" label="Choose Fastq file"/> 
    66  </inputs> 
    77  <outputs> 
  • lib/galaxy/datatypes/converters/fastq_to_qual_converter.py

    r1366 r1367  
    2929    qual_title_startswith = '' 
    3030    seq_title_startswith = '' 
    31     default_coding_value = 33 
     31    default_coding_value = 64 
    3232    fastq_block_lines = 0 
    3333     
     
    7676            if fastq_integer: # digits 
    7777                qual = line 
    78             else: # ascii 
    79                 if datatype == 'fastqsolexa': 
    80                     outfile_score.close() 
    81                     stop_err( "This tool currently only works with the fastq solexa variant if the socres are integers, not ascii." ) 
     78            else:  
     79                # ascii 
    8280                quality_score_length = len( line ) 
    8381                if quality_score_length == read_length + 1: 
     
    8987                    stop_err( 'Invalid fastq format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) ) 
    9088                for j, char in enumerate( line ): 
    91                     score = ord( char ) - quality_score_startswith    # 33 
     89                    score = ord( char ) - quality_score_startswith    # 64 
    9290                    qual = "%s%s " % ( qual, str( score ) ) 
    9391            outfile_score.write( '%s\n' % qual ) 
  • lib/galaxy/datatypes/converters/fastq_to_qual_converter.xml

    r1366 r1367  
    22  <command interpreter="python">fastq_to_qual_converter.py $input1 $output1 $input1.extension</command> 
    33  <inputs> 
    4     <param format="fastq,fastqsolexa" name="input1" type="data" label="Choose Fastq file"/> 
     4    <param format="fastqsolexa" name="input1" type="data" label="Choose Fastq file"/> 
    55  </inputs> 
    66  <outputs> 
  • lib/galaxy/datatypes/registry.py

    r1337 r1367  
    6868                'customtrack' : interval.CustomTrack(), 
    6969                'fasta'       : sequence.Fasta(), 
    70                 'fastq'       : sequence.Fastq(), 
     70                'fastqsolexa' : sequence.FastqSolexa(), 
    7171                'gff'         : interval.Gff(), 
    7272                'gff3'        : interval.Gff3(),   
     
    9090                'customtrack' : 'text/plain', 
    9191                'fasta'       : 'text/plain', 
    92                 'fastq'      : 'text/plain', 
     92                'fastqsolexa' : 'text/plain', 
    9393                'gff'         : 'text/plain', 
    9494                'gff3'        : 'text/plain', 
  • lib/galaxy/datatypes/sequence.py

    r1345 r1367  
    9090            return False 
    9191 
    92 class Fastq( Sequence ): 
    93     """Class representing a FASTQ sequence ( the Sanger/Standard variant )""" 
    94     file_ext = "fastq" 
    95  
    96     def set_peek( self, dataset ): 
    97         Sequence.set_peek( self, dataset ) 
    98         count = 0 
    99         size = 0 
    100         bases_regexp = re.compile("^[NGTAC]*$") 
    101         for line in file( dataset.file_name ): 
    102             if line and line.startswith( ">" ): 
    103                 count += 1 
    104             elif bases_regexp.match( line ): 
    105                 line = line.strip() 
    106                 size += len( line ) 
    107         if count == 1: 
    108             dataset.blurb = '%d bases' % size 
    109         else: 
    110             dataset.blurb = '%d sequences' % count 
    111  
    112     def sniff(self, filename): 
    113         """ 
    114         Determines whether the file is in fastq format ( the Sanger/Standard variant ) 
    115         For details, see http://maq.sourceforge.net/fastq.shtml 
    116  
    117         Note: There are two kinds of FASTQ files, known as "Sanger" (sometimes called "Standard") and Solexa 
    118               These differ in the representation of the quality scores 
    119  
    120         >>> fname = get_test_fname( '1.fastq' ) 
    121         >>> Fastq().sniff( fname ) 
    122         True 
    123         >>> fname = get_test_fname( '1.fastqsolexa' ) 
    124         >>> Fastq().sniff( fname ) 
    125         False 
    126         """ 
    127         headers = get_headers( filename, None ) 
    128         bases_regexp = re.compile( "^[NGTAC]*$" ) 
    129         try: 
    130             if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0] and headers[3][0]: 
    131                 # Check the sequence line, make sure it contains only G/C/A/T/N 
    132                 if not bases_regexp.match( headers[1][0] ): 
    133                     return False 
    134                 # The quality score line 
    135                 qscore = headers[3][0] 
    136                 # In Standard/Sanger format, the quality score is a single string, whose length should be equal to the length of the sequence 
    137                 if len( qscore ) != len( headers[1][0] ): 
    138                     return False  
    139                 #Check the quality score values - in Sanger/Standard these should be ASCII characters between "!" (0x21) and "~" (0x7E) 
    140                 for x in qscore: 
    141                     if ord( x ) < 0x21 or ord( x ) > 0x7e: 
    142                         return False 
    143                 return True 
    144             return False 
    145         except: 
    146             return False 
    14792 
    14893class FastqSolexa( Sequence ): 
     
    155100        bases_regexp = re.compile("^[NGTAC]*$") 
    156101        for line in file( dataset.file_name ): 
    157             if line and line[0] == ">": 
     102            if line and line[0] == "@": 
    158103                count += 1 
    159104            elif bases_regexp.match(line): 
     
    175120        >>> fname = get_test_fname( '1.fastq' ) 
    176121        >>> FastqSolexa().sniff( fname ) 
    177         False
     122        True
    178123        >>> fname = get_test_fname( '1.fastqsolexa' ) 
    179124        >>> FastqSolexa().sniff( fname ) 
     
    187132                if not bases_regexp.match( headers[1][0] ): 
    188133                    return False 
    189                 qscore = headers[3] 
    190                 # In Solexa format, the quality score is a list of numbers, whose length should be equal to the length of the sequence 
    191                 if len( qscore ) != len( headers[1][0] ):
    192                     return False 
    193                 # Check the quality score values - in Solexa/FASTQ these should be valid decimal numbers 
    194                 # (if "x" is not a valid number, "int" will raise an exception) 
    195                 for x in qscore: 
    196                     try: 
    197                         check = int( x ) 
    198                     except:
     134                 
     135                # Check quality score: integer or ascii char. 
     136                try:
     137                    check = int(headers[3][0]) 
     138                    qscore_int = True 
     139                except: 
     140                    qscore_int = False 
     141                 
     142                if qscore_int: 
     143                    if len( headers[3] ) != len( headers[1][0] ):
    199144                        return False 
     145                else: 
     146                    if len( headers[3][0] ) != len( headers[1][0] ): 
     147                        return False                 
    200148                return True  
    201149            return False 
  • tools/data_source/upload.xml

    r1340 r1367  
    9090----- 
    9191 
    92 **Fastq** 
     92**FastqSolexa** 
    9393 
    94 Fastq format stores sequences and Phred qualities in a single file. We define Fastq as the Sanger/Standard variant:: 
     94Fastq format stores sequences and quality scores in a single file. We define FastqSolexa as the Illumina (Solexa) variant:: 
    9595 
    96         @EAS54_6_R1_2_1_413_324 
    97         CCCTTCTTGTCTTCAGCGTTTCTCC 
    98         + 
    99         ;;3;;;;;;;;;;;;7;;;;;;;88 
    100         @EAS54_6_R1_2_1_540_792 
    101         TTGGCAGGCCAAGGCCGATGGATCA 
    102         + 
    103         ;;;;;;;;;;;7;;;;;-;;;3;83 
    104         @EAS54_6_R1_2_1_443_348 
    105         GTTGCTTCTGGCGTGGGTGGGGGGG 
    106         +EAS54_6_R1_2_1_443_348 
    107         ;;;;;;;;;;;9;7;;.7;393333 
     96    @seq1   
     97    GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT   
     98    +seq1   
     99    hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh   
     100    @seq2   
     101    GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG   
     102    +seq2   
     103    hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO 
     104     
     105Or::  
    108106 
     107    @seq1 
     108    GAATTGATCAGGACATAGGACAACTGTAGGCACCAT 
     109    +seq1 
     110    40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 
     111    @seq2 
     112    GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG 
     113    +seq2 
     114    40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 
     115     
    109116----- 
    110117 
  • tools/metag_tools/fastq_to_fasta_qual.py

    r1366 r1367  
    3737    seq_title_startswith = '' 
    3838    qual_title_startswith = '' 
    39     default_coding_value = 33 
     39    default_coding_value = 64 
    4040    fastq_block_lines = 0 
    4141     
     
    9393                qual = line 
    9494            else: 
    95                 if datatype == 'fastqsolexa': 
    96                     outfile_seq.close() 
    97                     outfile_score.close() 
    98                     stop_err( "This tool currently only works with the fastq solexa variant if the socres are integers, not ascii." ) 
    9995                # ascii 
    10096                quality_score_length = len( line ) 
     
    108104                    stop_err( 'Invalid fastq format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) ) 
    109105                for j, char in enumerate( line ): 
    110                     score = ord( char ) - qual_score_startswith    # 33 
     106                    score = ord( char ) - qual_score_startswith    # 64 
    111107                    qual = "%s%s " % ( qual, str( score ) ) 
    112108            outfile_score.write( '%s\n' % qual ) 
  • tools/metag_tools/fastq_to_fasta_qual.xml

    r1341 r1367  
    33  <command interpreter="python">fastq_to_fasta_qual.py $input1 $output1 $output2 $input1.extension</command> 
    44  <inputs> 
    5     <param name="input1" type="data" format="fastq,fastqsolexa" label="Fastq file"/> 
     5    <param name="input1" type="data" format="fastqsolexa" label="Fastq file"/> 
    66  </inputs> 
    77  <outputs> 
     
    1212    <!-- NOTE: this tool generates 2 output files, but our functional tests currently only handle the last one generated --> 
    1313    <test> 
    14       <param name="input1" value="1.fastq" ftype="fastq" /> 
     14      <param name="input1" value="1.fastq" ftype="fastqsolexa" /> 
    1515      <output name="output1" file="fastq_to_fasta_qual_out2.fasta" /> 
    1616    </test> 
    1717    <test> 
    18       <param name="input1" value="1.fastqsolexa" ftype="fastq" /> 
     18      <param name="input1" value="1.fastqsolexa" ftype="fastqsolexa" /> 
    1919      <output name="output1" file="fastq_to_fasta_qual_out4.fasta" /> 
    2020    </test> 
     
    2424.. class:: warningmark 
    2525 
    26 IMPORTANT: With the Fastq Solexa variant, this tool currently only works with data where the quality scores are integers, ASCII quality scores are not supported. 
     26IMPORTANT: This tool currently only support data where the quality scores are integers or ASCII quality scores with base 64.   
    2727 
    2828----- 
     
    3030**What it does** 
    3131 
    32 This tool extracts sequences and quality scores from FASTQ data ( both Sanger/Standard and Solexa variants ), producing a FASTA dataset and a QUAL dataset.  With the Solexa variant, this tool currently only works with data where the quality scores are integers, ASCII quality scores are not supported.
     32This tool extracts sequences and quality scores from FASTQ data ( Solexa variants ), producing a FASTA dataset and a QUAL dataset.
    3333 
    3434----- 
     
    3838- Converting the following Sanger/Standard fastq data:: 
    3939 
    40     @EAS54_6_R1_2_1_413_324 
    41     CCCTTCTTGTCTTCAGCGTTTCTCC 
    42     + 
    43     ;;3;;;;;;;;;;;;7;;;;;;;88 
    44     @EAS54_6_R1_2_1_540_792 
    45     TTGGCAGGCCAAGGCCGATGGATCA 
    46     + 
    47     ;;;;;;;;;;;7;;;;;-;;;3;83 
    48     @EAS54_6_R1_2_1_443_348 
    49     GTTGCTTCTGGCGTGGGTGGGGGGG 
    50     +EAS54_6_R1_2_1_443_348 
    51     ;;;;;;;;;;;9;7;;.7;393333 
    52  
     40    @seq1   
     41    GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT   
     42    +seq1   
     43    hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh   
     44    @seq2   
     45    GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG   
     46    +seq2   
     47    hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO 
    5348 
    5449- will extract the following sequences:: 
    5550 
    56     >EAS54_6_R1_2_1_413_324 
    57     CCCTTCTTGTCTTCAGCGTTTCTCC 
    58     >EAS54_6_R1_2_1_540_792 
    59     TTGGCAGGCCAAGGCCGATGGATCA 
    60     >EAS54_6_R1_2_1_443_348 
    61     GTTGCTTCTGGCGTGGGTGGGGGGG 
    62  
     51    >seq1 
     52    GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT 
     53    >seq2 
     54    GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG 
     55     
    6356- and quality scores:: 
    6457 
    65     >EAS54_6_R1_2_1_413_324 
    66     26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 26 26 23 23  
    67     >EAS54_6_R1_2_1_540_792 
    68     26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26 26 18 26 23 18  
    69     >EAS54_6_R1_2_1_443_348 
    70     26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18 24 18 18 18 18  
    71  
     58    >seq1 
     59    40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 23 0 40 40 40 40 40 40  
     60    >seq2 
     61    40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15  
    7262 
    7363**Example2** 
  • universe_wsgi.ini.sample

    r1357 r1367  
    176176data = galaxy.datatypes.data:Data,application/octet-stream 
    177177fasta = galaxy.datatypes.sequence:Fasta,display_in_upload 
    178 fastq = galaxy.datatypes.sequence:Fastq,display_in_upload 
    179178fastqsolexa = galaxy.datatypes.sequence:FastqSolexa,display_in_upload 
    180179gff = galaxy.datatypes.interval:Gff,display_in_upload