Changeset 1367:2328b62c0846
- Timestamp:
- 06/06/08 14:52:11 (7 months ago)
- Files:
-
- datatype_converters_conf.xml.sample (modified) (1 diff)
- lib/galaxy/datatypes/converters/fastq_to_fasta_converter.xml (modified) (1 diff)
- lib/galaxy/datatypes/converters/fastq_to_qual_converter.py (modified) (3 diffs)
- lib/galaxy/datatypes/converters/fastq_to_qual_converter.xml (modified) (1 diff)
- lib/galaxy/datatypes/registry.py (modified) (2 diffs)
- lib/galaxy/datatypes/sequence.py (modified) (4 diffs)
- tools/data_source/upload.xml (modified) (1 diff)
- tools/metag_tools/fastq_to_fasta_qual.py (modified) (3 diffs)
- tools/metag_tools/fastq_to_fasta_qual.xml (modified) (5 diffs)
- universe_wsgi.ini.sample (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
datatype_converters_conf.xml.sample
r1366 r1367
  3   3  <converter file="bed_to_gff_converter.xml" source_datatype="bed" target_datatype="gff"/>
  4   4  <converter file="fasta_to_tabular_converter.xml" source_datatype="fasta" target_datatype="tabular"/>
  5      <converter file="fastq_to_fasta_converter.xml" source_datatype="fastq ,fastqsolexa" target_datatype="fasta"/>
  6      <converter file="fastq_to_qual_converter.xml" source_datatype="fastq ,fastqsolexa" target_datatype="qual"/>
      5  <converter file="fastq_to_fasta_converter.xml" source_datatype="fastqsolexa" target_datatype="fasta"/>
      6  <converter file="fastq_to_qual_converter.xml" source_datatype="fastqsolexa" target_datatype="qual"/>
  7   7  <converter file="gff_to_bed_converter.xml" source_datatype="gff" target_datatype="bed"/>
  8   8  <converter file="interval_to_bed_converter.xml" source_datatype="interval" target_datatype="bed"/>
lib/galaxy/datatypes/converters/fastq_to_fasta_converter.xml
r1366 r1367
  3   3  <command interpreter="python">fastq_to_fasta_converter.py $input $output</command>
  4   4  <inputs>
  5      <param name="input" type="data" format="fastq ,fastqsolexa" label="Choose Fastq file"/>
      5  <param name="input" type="data" format="fastqsolexa" label="Choose Fastq file"/>
  6   6  </inputs>
  7   7  <outputs>
lib/galaxy/datatypes/converters/fastq_to_qual_converter.py
r1366 r1367
 29  29  qual_title_startswith = ''
 30  30  seq_title_startswith = ''
 31      default_coding_value = 33
     31  default_coding_value = 64
 32  32  fastq_block_lines = 0
 33  33
  …   …
 76  76  if fastq_integer: # digits
 77  77  qual = line
 78      else: # ascii
 79      if datatype == 'fastqsolexa':
 80      outfile_score.close()
 81      stop_err( "This tool currently only works with the fastq solexa variant if the socres are integers, not ascii." )
     78  else:
     79  # ascii
 82  80  quality_score_length = len( line )
 83  81  if quality_score_length == read_length + 1:
  …   …
 89  87  stop_err( 'Invalid fastq format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )
 90  88  for j, char in enumerate( line ):
 91      score = ord( char ) - quality_score_startswith # 33
     89  score = ord( char ) - quality_score_startswith # 64
 92  90  qual = "%s%s " % ( qual, str( score ) )
 93  91  outfile_score.write( '%s\n' % qual )
lib/galaxy/datatypes/converters/fastq_to_qual_converter.xml
r1366 r1367
  2   2  <command interpreter="python">fastq_to_qual_converter.py $input1 $output1 $input1.extension</command>
  3   3  <inputs>
  4      <param format="fastq ,fastqsolexa" name="input1" type="data" label="Choose Fastq file"/>
      4  <param format="fastqsolexa" name="input1" type="data" label="Choose Fastq file"/>
  5   5  </inputs>
  6   6  <outputs>
lib/galaxy/datatypes/registry.py
r1337 r1367
 68  68  'customtrack' : interval.CustomTrack(),
 69  69  'fasta' : sequence.Fasta(),
 70      'fastq ' : sequence.Fastq(),
     70  'fastqsolexa' : sequence.FastqSolexa(),
 71  71  'gff' : interval.Gff(),
 72  72  'gff3' : interval.Gff3(),
  …   …
 90  90  'customtrack' : 'text/plain',
 91  91  'fasta' : 'text/plain',
 92      'fastq ': 'text/plain',
     92  'fastqsolexa' : 'text/plain',
 93  93  'gff' : 'text/plain',
 94  94  'gff3' : 'text/plain',
lib/galaxy/datatypes/sequence.py
r1345 r1367 90 90 return False 91 91 92 class Fastq( Sequence ):93 """Class representing a FASTQ sequence ( the Sanger/Standard variant )"""94 file_ext = "fastq"95 96 def set_peek( self, dataset ):97 Sequence.set_peek( self, dataset )98 count = 099 size = 0100 bases_regexp = re.compile("^[NGTAC]*$")101 for line in file( dataset.file_name ):102 if line and line.startswith( ">" ):103 count += 1104 elif bases_regexp.match( line ):105 line = line.strip()106 size += len( line )107 if count == 1:108 dataset.blurb = '%d bases' % size109 else:110 dataset.blurb = '%d sequences' % count111 112 def sniff(self, filename):113 """114 Determines whether the file is in fastq format ( the Sanger/Standard variant )115 For details, see http://maq.sourceforge.net/fastq.shtml116 117 Note: There are two kinds of FASTQ files, known as "Sanger" (sometimes called "Standard") and Solexa118 These differ in the representation of the quality scores119 120 >>> fname = get_test_fname( '1.fastq' )121 >>> Fastq().sniff( fname )122 True123 >>> fname = get_test_fname( '1.fastqsolexa' )124 >>> Fastq().sniff( fname )125 False126 """127 headers = get_headers( filename, None )128 bases_regexp = re.compile( "^[NGTAC]*$" )129 try:130 if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0] and headers[3][0]:131 # Check the sequence line, make sure it contains only G/C/A/T/N132 if not bases_regexp.match( headers[1][0] ):133 return False134 # The quality score line135 qscore = headers[3][0]136 # In Standard/Sanger format, the quality score is a single string, whose length should be equal to the length of the sequence137 if len( qscore ) != len( headers[1][0] ):138 return False139 #Check the quality score values - in Sanger/Standard these should be ASCII characters between "!" 
(0x21) and "~" (0x7E)140 for x in qscore:141 if ord( x ) < 0x21 or ord( x ) > 0x7e:142 return False143 return True144 return False145 except:146 return False147 92 148 93 class FastqSolexa( Sequence ): … … 155 100 bases_regexp = re.compile("^[NGTAC]*$") 156 101 for line in file( dataset.file_name ): 157 if line and line[0] == " >":102 if line and line[0] == "@": 158 103 count += 1 159 104 elif bases_regexp.match(line): … … 175 120 >>> fname = get_test_fname( '1.fastq' ) 176 121 >>> FastqSolexa().sniff( fname ) 177 False122 True 178 123 >>> fname = get_test_fname( '1.fastqsolexa' ) 179 124 >>> FastqSolexa().sniff( fname ) … … 187 132 if not bases_regexp.match( headers[1][0] ): 188 133 return False 189 qscore = headers[3]190 # In Solexa format, the quality score is a list of numbers, whose length should be equal to the length of the sequence191 if len( qscore ) != len( headers[1][0] ):192 return False193 # Check the quality score values - in Solexa/FASTQ these should be valid decimal numbers194 # (if "x" is not a valid number, "int" will raise an exception)195 for x in qscore:196 try:197 check = int( x )198 except:134 135 # Check quality score: integer or ascii char. 136 try: 137 check = int(headers[3][0]) 138 qscore_int = True 139 except: 140 qscore_int = False 141 142 if qscore_int: 143 if len( headers[3] ) != len( headers[1][0] ): 199 144 return False 145 else: 146 if len( headers[3][0] ) != len( headers[1][0] ): 147 return False 200 148 return True 201 149 return False tools/data_source/upload.xml
r1340 r1367 90 90 ----- 91 91 92 **Fastq **92 **FastqSolexa** 93 93 94 Fastq format stores sequences and Phred qualities in a single file. We define Fastq as the Sanger/Standardvariant::94 Fastq format stores sequences and quality scores in a single file. We define FastqSolexa as the Illumina (Solexa) variant:: 95 95 96 @EAS54_6_R1_2_1_413_324 97 CCCTTCTTGTCTTCAGCGTTTCTCC 98 + 99 ;;3;;;;;;;;;;;;7;;;;;;;88 100 @EAS54_6_R1_2_1_540_792 101 TTGGCAGGCCAAGGCCGATGGATCA 102 + 103 ;;;;;;;;;;;7;;;;;-;;;3;83 104 @EAS54_6_R1_2_1_443_348 105 GTTGCTTCTGGCGTGGGTGGGGGGG 106 +EAS54_6_R1_2_1_443_348 107 ;;;;;;;;;;;9;7;;.7;393333 96 @seq1 97 GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT 98 +seq1 99 hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh 100 @seq2 101 GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG 102 +seq2 103 hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO 104 105 Or:: 108 106 107 @seq1 108 GAATTGATCAGGACATAGGACAACTGTAGGCACCAT 109 +seq1 110 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 111 @seq2 112 GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG 113 +seq2 114 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 115 109 116 ----- 110 117 tools/metag_tools/fastq_to_fasta_qual.py
r1366 r1367
 37  37  seq_title_startswith = ''
 38  38  qual_title_startswith = ''
 39      default_coding_value = 33
     39  default_coding_value = 64
 40  40  fastq_block_lines = 0
 41  41
  …   …
 93  93  qual = line
 94  94  else:
 95      if datatype == 'fastqsolexa':
 96      outfile_seq.close()
 97      outfile_score.close()
 98      stop_err( "This tool currently only works with the fastq solexa variant if the socres are integers, not ascii." )
 99  95  # ascii
100  96  quality_score_length = len( line )
  …   …
108 104  stop_err( 'Invalid fastq format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )
109 105  for j, char in enumerate( line ):
110      score = ord( char ) - qual_score_startswith # 33
    106  score = ord( char ) - qual_score_startswith # 64
111 107  qual = "%s%s " % ( qual, str( score ) )
112 108  outfile_score.write( '%s\n' % qual )
tools/metag_tools/fastq_to_fasta_qual.xml
r1341 r1367 3 3 <command interpreter="python">fastq_to_fasta_qual.py $input1 $output1 $output2 $input1.extension</command> 4 4 <inputs> 5 <param name="input1" type="data" format="fastq ,fastqsolexa" label="Fastq file"/>5 <param name="input1" type="data" format="fastqsolexa" label="Fastq file"/> 6 6 </inputs> 7 7 <outputs> … … 12 12 <!-- NOTE: this tool generates 2 output files, but our functional tests currently only handle the last one generated --> 13 13 <test> 14 <param name="input1" value="1.fastq" ftype="fastq " />14 <param name="input1" value="1.fastq" ftype="fastqsolexa" /> 15 15 <output name="output1" file="fastq_to_fasta_qual_out2.fasta" /> 16 16 </test> 17 17 <test> 18 <param name="input1" value="1.fastqsolexa" ftype="fastq " />18 <param name="input1" value="1.fastqsolexa" ftype="fastqsolexa" /> 19 19 <output name="output1" file="fastq_to_fasta_qual_out4.fasta" /> 20 20 </test> … … 24 24 .. class:: warningmark 25 25 26 IMPORTANT: With the Fastq Solexa variant, this tool currently only works with data where the quality scores are integers, ASCII quality scores are not supported.26 IMPORTANT: This tool currently only support data where the quality scores are integers or ASCII quality scores with base 64. 27 27 28 28 ----- … … 30 30 **What it does** 31 31 32 This tool extracts sequences and quality scores from FASTQ data ( both Sanger/Standard and Solexa variants ), producing a FASTA dataset and a QUAL dataset. With the Solexa variant, this tool currently only works with data where the quality scores are integers, ASCII quality scores are not supported.32 This tool extracts sequences and quality scores from FASTQ data ( Solexa variants ), producing a FASTA dataset and a QUAL dataset. 
33 33 34 34 ----- … … 38 38 - Converting the following Sanger/Standard fastq data:: 39 39 40 @EAS54_6_R1_2_1_413_324 41 CCCTTCTTGTCTTCAGCGTTTCTCC 42 + 43 ;;3;;;;;;;;;;;;7;;;;;;;88 44 @EAS54_6_R1_2_1_540_792 45 TTGGCAGGCCAAGGCCGATGGATCA 46 + 47 ;;;;;;;;;;;7;;;;;-;;;3;83 48 @EAS54_6_R1_2_1_443_348 49 GTTGCTTCTGGCGTGGGTGGGGGGG 50 +EAS54_6_R1_2_1_443_348 51 ;;;;;;;;;;;9;7;;.7;393333 52 40 @seq1 41 GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT 42 +seq1 43 hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh 44 @seq2 45 GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG 46 +seq2 47 hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO 53 48 54 49 - will extract the following sequences:: 55 50 56 >EAS54_6_R1_2_1_413_324 57 CCCTTCTTGTCTTCAGCGTTTCTCC 58 >EAS54_6_R1_2_1_540_792 59 TTGGCAGGCCAAGGCCGATGGATCA 60 >EAS54_6_R1_2_1_443_348 61 GTTGCTTCTGGCGTGGGTGGGGGGG 62 51 >seq1 52 GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT 53 >seq2 54 GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG 55 63 56 - and quality scores:: 64 57 65 >EAS54_6_R1_2_1_413_324 66 26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 26 26 23 23 67 >EAS54_6_R1_2_1_540_792 68 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26 26 18 26 23 18 69 >EAS54_6_R1_2_1_443_348 70 26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18 24 18 18 18 18 71 58 >seq1 59 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 23 0 40 40 40 40 40 40 60 >seq2 61 40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15 72 62 73 63 **Example2** universe_wsgi.ini.sample
r1357 r1367
176 176  data = galaxy.datatypes.data:Data,application/octet-stream
177 177  fasta = galaxy.datatypes.sequence:Fasta,display_in_upload
178      fastq = galaxy.datatypes.sequence:Fastq,display_in_upload
179 178  fastqsolexa = galaxy.datatypes.sequence:FastqSolexa,display_in_upload
180 179  gff = galaxy.datatypes.interval:Gff,display_in_upload