Changeset 1340:1c3b45fb0562
- Timestamp:
- 05/29/08 15:12:36 (7 months ago)
- Files:
-
- lib/galaxy/datatypes/converters/fastq_to_fasta_converter.py (modified) (2 diffs)
- lib/galaxy/datatypes/converters/fastq_to_fasta_converter.xml (modified) (1 diff)
- lib/galaxy/datatypes/converters/fastq_to_qual_converter.py (modified) (3 diffs)
- lib/galaxy/datatypes/converters/fastq_to_qual_converter.xml (modified) (1 diff)
- lib/galaxy/datatypes/sequence.py (modified) (3 diffs)
- lib/galaxy/datatypes/test/1.fastq (added)
- lib/galaxy/datatypes/test/1.fastqsolexa (added)
- test-data/1.fastqsolexa (added)
- test-data/2.fastq (added)
- test-data/fastq_to_fasta_qual_out2.fasta (moved) (moved from test-data/convert_fastq2fasta_out2.fasta)
- test-data/fastq_to_fasta_qual_out4.fasta (added)
- test/functional/test_sniffing_and_metadata_settings.py (modified) (1 diff)
- tool_conf.xml.sample (modified) (1 diff)
- tools/data_source/upload.xml (modified) (5 diffs)
- tools/metag_tools/fastq_to_fasta_qual.py (moved) (moved from tools/metag_tools/convert_fastq2fasta.py) (1 diff)
- tools/metag_tools/fastq_to_fasta_qual.xml (moved) (moved from tools/metag_tools/convert_fastq2fasta.xml) (5 diffs)
- universe_wsgi.ini.sample (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
lib/galaxy/datatypes/converters/fastq_to_fasta_converter.py
r1333 r1340 1 1 #! /usr/bin/python 2 3 2 """ 4 3 convert fastq file to separated sequence and quality files. … … 21 20 assert sys.version_info[:2] >= ( 2, 4 ) 22 21 23 24 22 def stop_err( msg ): 25 26 sys.stderr.write( "%s\n" % msg ) 23 sys.stderr.write( "%s" % msg ) 27 24 sys.exit() 28 25 26 def __main__(): 27 infile_name = sys.argv[1] 28 outfile = open( sys.argv[2], 'w' ) 29 fastq_block_lines = 0 30 seq_title_startswith = '' 29 31 30 if __name__ == '__main__': 31 32 # file I/O 33 infile = sys.argv[1] 34 outfile_seq = open(sys.argv[2], 'w') 35 36 37 # guessing the first char used in title lines 38 leading_char_seq_title = '' 39 40 every_four_lines = 0 41 42 for i, line in enumerate(file(infile)): 43 44 line = line.rstrip() # get rid of the newline and spaces 45 46 if ((not line) or (line.startswith('#'))): continue # comments 47 48 every_four_lines = (every_four_lines + 1) % 4 49 leading_char = line[0:1] 50 51 if every_four_lines == 1: # first line is expected to be read title 52 if not leading_char_seq_title: 53 leading_char_seq_title = leading_char 54 if leading_char != leading_char_seq_title: 55 stop_err('Invalid fastq format at line %d.' %(i)) 56 read_title = line[1:] 57 outfile_seq.write('>%s\n' %(line[1:])) 58 59 elif every_four_lines == 2: # second line is expected to be read 60 read_length = len(line) 61 outfile_seq.write('%s\n' %(line)) 62 32 for i, line in enumerate( file( infile_name ) ): 33 line = line.rstrip() # eliminate trailing space and new line characters 34 if not line or line.startswith( '#' ): 35 continue 36 fastq_block_lines = ( fastq_block_lines + 1 ) % 4 37 line_startswith = line[0:1] 38 if fastq_block_lines == 1: 39 # line 1 is sequence title 40 if not seq_title_startswith: 41 seq_title_startswith = line_startswith 42 if seq_title_startswith != line_startswith: 43 stop_err( 'Invalid fastq format at line %d: %s.' 
%( i + 1, line ) ) 44 read_title = line[ 1: ] 45 outfile.write( '>%s\n' % line[1:] ) 46 elif fastq_block_lines == 2: 47 # line 2 is nucleotides 48 read_length = len( line ) 49 outfile.write( '%s\n' % line ) 63 50 else: 64 51 pass 65 52 66 outfile _seq.close()53 outfile.close() 67 54 68 69 55 if __name__ == "__main__": __main__() lib/galaxy/datatypes/converters/fastq_to_fasta_converter.xml
r1333 r1340 1 <tool id="CONVERTER_fastq_to_fasta_0" name=" FASTQ-to-FASTA" version="1.0.0">2 <description>converts F ASTQ file to FASTAformat</description>1 <tool id="CONVERTER_fastq_to_fasta_0" name="Convert Fastq to Fasta" version="1.0.0"> 2 <description>converts Fastq file to Fasta format</description> 3 3 <command interpreter="python">fastq_to_fasta_converter.py $input $output</command> 4 4 <inputs> 5 <param name="input" type="data" format="fastq" label=" Fastq file"/>5 <param name="input" type="data" format="fastq" label="Choose Fastq file"/> 6 6 </inputs> 7 7 <outputs> lib/galaxy/datatypes/converters/fastq_to_qual_converter.py
r1333 r1340 1 1 #! /usr/bin/python 2 3 2 """ 4 3 convert fastq file to separated sequence and quality files. … … 15 14 %python convert_fastq2fasta.py <your_fastq_filename> <output_seq_filename> <output_score_filename> 16 15 """ 17 18 16 import sys, os 19 17 from math import * … … 21 19 assert sys.version_info[:2] >= ( 2, 4 ) 22 20 23 24 21 def stop_err( msg ): 25 26 sys.stderr.write( "%s\n" % msg ) 22 sys.stderr.write( "%s" % msg ) 27 23 sys.exit() 28 24 29 30 if __name__ == '__main__': 31 32 # file I/O 33 infile = sys.argv[1] 34 outfile_score = open(sys.argv[2], 'w') 25 def __main__(): 26 infile_name = sys.argv[1] 27 outfile_score = open( sys.argv[2], 'w' ) 28 qual_title_startswith = '' 29 seq_title_startswith = '' 30 default_coding_value = 64 31 fastq_block_lines = 0 35 32 36 # guessing the first char used in title lines 37 leading_char_quality_title = '' 38 leading_char_seq_title = '' 39 default_coding_value = 64 40 41 every_four_lines = 0 42 43 for i, line in enumerate(file(infile)): 44 45 line = line.rstrip() # get rid of the newline and spaces 46 47 if ((not line) or (line.startswith('#'))): continue # comments 48 49 every_four_lines = (every_four_lines + 1) % 4 50 leading_char = line[0:1] 51 52 if every_four_lines == 1: # first line is expected to be read title 53 if not leading_char_seq_title: 54 leading_char_seq_title = leading_char 55 if leading_char != leading_char_seq_title: 56 stop_err('Invalid fastq format at line %d.' %(i)) 33 for i, line in enumerate( file( infile_name ) ): 34 line = line.rstrip() 35 if not line or line.startswith( '#' ): 36 continue 37 fastq_block_lines = ( fastq_block_lines + 1 ) % 4 38 line_startswith = line[0:1] 39 if fastq_block_lines == 1: 40 # first line is @title_of_seq 41 if not seq_title_startswith: 42 seq_title_startswith = line_startswith 43 if line_startswith != seq_title_startswith: 44 stop_err( 'Invalid fastq format at line %d: %s.' 
% ( i + 1, line ) ) 57 45 read_title = line[1:] 58 59 elif every_four_lines == 2: # second line is expected to be read60 read_length = len( line)61 62 elif every_four_lines == 3: # third line is expected to be quality title63 if not leading_char_quality_title:64 leading_char_quality_title = leading_char65 if l eading_char != leading_char_quality_title:66 stop_err( 'Invalid fastq format at line %d.' %(i))46 elif fastq_block_lines == 2: 47 # second line is nucleotides 48 read_length = len( line ) 49 elif fastq_block_lines == 3: 50 # third line is +title_of_qualityscore (might be skipped) 51 if not qual_title_startswith: 52 qual_title_startswith = line_startswith 53 if line_startswith != qual_title_startswith: 54 stop_err( 'Invalid fastq format at line %d: %s.' % ( i + 1, line ) ) 67 55 quality_title = line[1:] 68 69 if (quality_title and (read_title != quality_title)): 70 stop_err('Invalid fastq format: titles for sequence and quality score are different.') 71 56 if quality_title and read_title != quality_title: 57 stop_err( 'Invalid fastq format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) ) 72 58 if not quality_title: 73 outfile_score.write( '>%s\n' %(read_title))59 outfile_score.write( '>%s\n' % read_title ) 74 60 else: 75 outfile_score.write( '>%s\n' %(line[1:]))76 77 else: # fourth line is expected to be the ASCII-coded quality scores61 outfile_score.write( '>%s\n' % line[1:] ) 62 else: 63 # fourth line is quality scores 78 64 qual = '' 79 80 # peek: ascii code or digits? 81 first_value = line.split()[0] 82 83 if first_value.isdigit(): 65 # peek: ascii or digits? 
66 val = line.split()[0] 67 if val.isdigit(): 84 68 # digits 85 69 qual = line 86 70 else: 87 # ascii code 88 # guess leading char 89 quality_score_length = len(line) 90 if quality_score_length == (read_length+1): # first char is leading_char_score 91 leading_char_score = ord(line[0:1]) 71 # ascii 72 quality_score_length = len( line ) 73 if quality_score_length == read_length + 1: 74 quality_score_startswith = ord( line[0:1] ) 92 75 line = line[1:] 93 76 elif quality_score_length == read_length: 94 leading_char_score = default_coding_value # default77 quality_score_startswith = default_coding_value 95 78 else: 96 stop_err('Invalid fastq format: the number of quality scores is not the same as bases.') 97 98 for j, char in enumerate(line): 99 score = ord(char)-leading_char_score # 64 100 qual += (str(score) + ' ') 101 102 outfile_score.write('%s\n' %(qual)) 79 stop_err( 'Invalid fastq format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) ) 80 for j, char in enumerate( line ): 81 score = ord( char ) - quality_score_startswith # 64 82 qual = "%s%s " % ( qual, str( score ) ) 83 outfile_score.write( '%s\n' % qual ) 103 84 104 85 outfile_score.close() 86 87 if __name__ == "__main__": __main__() 105 88 106 lib/galaxy/datatypes/converters/fastq_to_qual_converter.xml
r1333 r1340 1 <tool id="CONVERTER_fastq_to_qual_0" name=" FASTQ-to-FASTA">2 <command interpreter="python">fastq_to_qual_converter.py $input1 $output1 </command>1 <tool id="CONVERTER_fastq_to_qual_0" name="Convert Fastq to Qual"> 2 <command interpreter="python">fastq_to_qual_converter.py $input1 $output1</command> 3 3 <inputs> 4 <param format="fastq" name="input1" type="data" label=" Fastq file"/>4 <param format="fastq" name="input1" type="data" label="Choose Fastq file"/> 5 5 </inputs> 6 6 <outputs> lib/galaxy/datatypes/sequence.py
r1328 r1340 5 5 import data 6 6 import logging 7 import re 7 8 from cgi import escape 8 9 from galaxy.datatypes.metadata import MetadataElement … … 90 91 91 92 class Fastq( Sequence ): 92 """Class representing a FASTQ sequence""" 93 # FASTQ format stores sequences and Phred qualities in a single file. It is concise and compact. 94 # FASTQ is first widely used in the Sanger Institute and therefore we usually take the Sanger 95 # specification and the standard FASTQ format, or simply FASTQ format. Although Solexa/Illumina 96 # read file looks pretty much like FASTQ, they are different in that the qualities are scaled 97 # differently. In the quality string, if you can see a character with its ASCII code higher than 98 # 90, probably your file is in the Solexa/Illumina format. 99 # 100 # For details, see http://maq.sourceforge.net/fastq.shtml 93 """Class representing a FASTQ sequence ( the Sanger/Standard variant )""" 101 94 file_ext = "fastq" 102 95 103 96 def set_peek( self, dataset ): 104 97 Sequence.set_peek( self, dataset ) 105 sequences = 0 106 scores = 0 98 count = 0 99 size = 0 100 bases_regexp = re.compile("^[NGTAC]*$") 107 101 for line in file( dataset.file_name ): 108 if line: 109 if line.startswith( "@" ): 110 sequences += 1 111 elif line.startswith( '+' ): 112 scores += 1 113 dataset.blurb = '%d sequences, %d quality scores' % ( sequences, scores ) 102 if line and line.startswith( ">" ): 103 count += 1 104 elif bases_regexp.match( line ): 105 line = line.strip() 106 size += len( line ) 107 if count == 1: 108 dataset.blurb = '%d bases' % size 109 else: 110 dataset.blurb = '%d sequences' % count 111 112 def sniff(self, filename): 113 """ 114 Determines whether the file is in fastq format ( the Sanger/Standard variant ) 115 For details, see http://maq.sourceforge.net/fastq.shtml 116 117 Note: There are two kinds of FASTQ files, known as "Sanger" (sometimes called "Standard") and Solexa 118 These differ in the representation of the quality scores 119 120 >>> 
fname = get_test_fname( '1.fastq' ) 121 >>> Fastq().sniff( fname ) 122 True 123 >>> fname = get_test_fname( '1.fastqsolexa' ) 124 >>> Fastq().sniff( fname ) 125 False 126 """ 127 headers = get_headers( filename, None ) 128 bases_regexp = re.compile( "^[NGTAC]*$" ) 129 try: 130 if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0] and headers[3][0]: 131 # Check the sequence line, make sure it contains only G/C/A/T/N 132 if not bases_regexp.match( headers[1][0] ): 133 return False 134 # The quality score line 135 qscore = headers[3][0] 136 # In Standard/Sanger format, the quality score is a single string, whose length should be equal to the length of the sequence 137 if len( qscore ) != len( headers[1][0] ): 138 return False 139 #Check the quality score values - in Sanger/Standard these should be ASCII characters between "!" (0x21) and "~" (0x7E) 140 for x in qscore: 141 if ord( x ) < 0x21 or ord( x ) > 0x7e: 142 return False 143 return True 144 return False 145 except: 146 return False 147 148 class FastqSolexa( Sequence ): 149 """Class representing a FASTQ sequence ( the Solexa variant )""" 150 file_ext = "fastqsolexa" 151 152 def set_peek( self, dataset ): 153 Sequence.set_peek( self, dataset ) 154 count = size = 0 155 bases_regexp = re.compile("^[NGTAC]*$") 156 for line in file( dataset.file_name ): 157 if line and line[0] == ">": 158 count += 1 159 elif bases_regexp.match(line): 160 line = line.strip() 161 size += len(line) 162 if count == 1: 163 dataset.blurb = '%d bases' % size 164 else: 165 dataset.blurb = '%d sequences' % count 166 167 def sniff( self, filename ): 168 """ 169 Determines whether the file is in fastq format (Solexa Variant) 170 For details, see http://maq.sourceforge.net/fastq.shtml 171 172 Note: There are two kinds of FASTQ files, known as "Sanger" (sometimes called "Standard") and Solexa 173 These differ in the representation of the quality scores 174 175 >>> 
fname = get_test_fname( '1.fastq' ) 176 >>> SolexaFastq().sniff( fname ) 177 False 178 >>> fname = get_test_fname( '1.fastqsolexa' ) 179 >>> SolexaFastq().sniff( fname ) 180 True 181 """ 182 headers = get_headers( filename, None ) 183 bases_regexp = re.compile( "^[NGTAC]*$" ) 184 try: 185 if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]: 186 # Check the sequence line, make sure it contains only G/C/A/T/N 187 if not bases_regexp.match( headers[1][0] ): 188 return False 189 qscore = headers[3] 190 # In Solexa format, the quality score is a list of numbers, whose length should be equal to the length of the sequence 191 if len( qscore ) != len( headers[1][0] ): 192 return False 193 # Check the quality score values - in Solexa/FASTQ these should be valid decimal numbers 194 # (if "x" is not a valid number, "int" will raise an exception) 195 for x in qscore: 196 try: 197 check = int( x ) 198 except: 199 return False 200 return True 201 return False 202 except: 203 return False 114 204 115 205 try: 206 from galaxy import eggs 116 207 import pkg_resources; pkg_resources.require( "bx-python" ) 117 208 import bx.align.maf … … 280 371 except: 281 372 return False 282 283 test/functional/test_sniffing_and_metadata_settings.py
r661 r1340 35 35 self.check_history_for_string('1.fasta format: <span class="fasta">fasta</span>, database: \? Info: uploaded file') 36 36 self.check_metadata_for_string('value="1.fasta" value="\?" Change data type selected value="fasta" selected="yes"') 37 self.delete_history_item( 1 ) 38 def test_17_fastq_datatype( self ): 39 """Testing correctly sniffing fastq ( the Sanger/Standard variant ) data type upon upload""" 40 self.upload_file('1.fastq') 41 self.verify_dataset_correctness('1.fastq') 42 self.check_history_for_string('1.fastq format: <span class="fastq">fastq</span>, database: \? Info: uploaded fastq file') 43 self.delete_history_item( 1 ) 44 def test_18_fastq_datatype( self ): 45 """Testing correctly sniffing fastq ( the Solexa variant ) data type upon upload""" 46 self.upload_file('1.fastqsolexa') 47 self.verify_dataset_correctness('1.fastqsolexa') 48 self.check_history_for_string('1.fastqsolexa format: <span class="fastqsolexa">fastqsolexa</span>, database: \? Info: uploaded fastqsolexa file') 37 49 self.delete_history_item( 1 ) 38 50 def test_20_gff_datatype( self ): tool_conf.xml.sample
r1328 r1340 60 60 <tool file="filters/bed2gff.xml" /> 61 61 <tool file="fasta_tools/fasta_to_tabular.xml" /> 62 <tool file="metag_tools/ convert_fastq2fasta.xml" />62 <tool file="metag_tools/fastq_to_fasta_qual.xml" /> 63 63 <tool file="filters/gff2bed.xml" /> 64 64 <tool file="filters/lav_to_bed.xml" /> tools/data_source/upload.xml
r1338 r1340 26 26 **Auto-detect** 27 27 28 The system will attempt to detect A XT, FASTA, Gff, HTML, LAV, Maf, Tabular, Wiggle, BED and Interval (BED with headers) formats. If your file is not detected properly as one of the known formats, it most likely means that it has some format problems (e.g., different number of columns on different rows). You can still coerce the system to set your data to the format you think it should be (please send us a note if you see a case when a valid format is not detected). You can also upload valid files that are compressed (gzipped), which will automatically be decompressed upon upload.28 The system will attempt to detect Axt, Fasta, Fastq, Gff, Gff3, Html, Lav, Maf, Tabular, Wiggle, Bed and Interval (Bed with headers) formats. If your file is not detected properly as one of the known formats, it most likely means that it has some format problems (e.g., different number of columns on different rows). You can still coerce the system to set your data to the format you think it should be. You can also upload compressed files, which will automatically be decompressed. 29 29 30 30 ----- … … 36 36 ----- 37 37 38 **A XT**38 **Axt** 39 39 40 40 blastz pairwise alignment format. Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields. … … 48 48 ----- 49 49 50 **B ED**50 **Bed** 51 51 52 52 * Tab delimited format (tabular) … … 77 77 ----- 78 78 79 **F ASTA**79 **Fasta** 80 80 81 81 A sequence in FASTA format consists of a single-line description, followed by lines of sequence data. The first character of the description line is a greater-than (">") symbol in the first column. 
All lines should be shorter than 80 characters:: … … 87 87 tttcgtgcgtatag 88 88 tggcgcggtga 89 90 ----- 91 92 **Fastq** 93 94 Fastq format stores sequences and Phred qualities in a single file. We define Fastq as the Sanger/Standard variant:: 95 96 @EAS54_6_R1_2_1_413_324 97 CCCTTCTTGTCTTCAGCGTTTCTCC 98 + 99 ;;3;;;;;;;;;;;;7;;;;;;;88 100 @EAS54_6_R1_2_1_540_792 101 TTGGCAGGCCAAGGCCGATGGATCA 102 + 103 ;;;;;;;;;;;7;;;;;-;;;3;83 104 @EAS54_6_R1_2_1_443_348 105 GTTGCTTCTGGCGTGGGTGGGGGGG 106 +EAS54_6_R1_2_1_443_348 107 ;;;;;;;;;;;9;7;;.7;393333 89 108 90 109 ----- tools/metag_tools/fastq_to_fasta_qual.py
r1330 r1340 21 21 assert sys.version_info[:2] >= ( 2, 4 ) 22 22 23 24 23 def stop_err( msg ): 25 26 sys.stderr.write( "%s\n" % msg ) 24 sys.stderr.write( "%s" % msg ) 27 25 sys.exit() 28 26 29 30 if __name__ == '__main__': 31 32 # file I/O 33 infile = sys.argv[1] 34 outfile_seq = open(sys.argv[2], 'w') 35 outfile_score = open(sys.argv[3], 'w') 27 def __main__(): 28 infile_name = sys.argv[1] 29 outfile_seq = open( sys.argv[2], 'w' ) 30 outfile_score = open( sys.argv[3], 'w' ) 31 seq_title_startswith = '' 32 qual_title_startswith = '' 33 default_coding_value = 64 34 fastq_block_lines = 0 36 35 37 # guessing the first char used in title lines 38 leading_char_seq_title = '' 39 leading_char_quality_title = '' 40 default_coding_value = 64 41 42 every_four_lines = 0 43 44 for i, line in enumerate(file(infile)): 45 46 line = line.rstrip() # get rid of the newline and spaces 47 48 if ((not line) or (line.startswith('#'))): continue # comments 49 50 every_four_lines = (every_four_lines + 1) % 4 51 leading_char = line[0:1] 52 53 if every_four_lines == 1: # first line is expected to be read title 54 if not leading_char_seq_title: 55 leading_char_seq_title = leading_char 56 if leading_char != leading_char_seq_title: 57 stop_err('Invalid fastq format at line %d.' %(i)) 36 for i, line in enumerate( file( infile_name ) ): 37 line = line.rstrip() 38 if not line or line.startswith( '#' ): 39 continue 40 fastq_block_lines = ( fastq_block_lines + 1 ) % 4 41 line_startswith = line[0:1] 42 if fastq_block_lines == 1: 43 # first line is @title_of_seq 44 if not seq_title_startswith: 45 seq_title_startswith = line_startswith 46 if line_startswith != seq_title_startswith: 47 stop_err( 'Invalid fastq format at line %d: %s.' 
% ( i + 1, line ) ) 58 48 read_title = line[1:] 59 outfile_seq.write( '>%s\n' %(line[1:]))60 61 elif every_four_lines == 2: # second line is expected to be read62 read_length = len( line)63 outfile_seq.write( '%s\n' %(line))64 65 elif every_four_lines == 3: # third line is expected to be quality title66 if not leading_char_quality_title:67 leading_char_quality_title = leading_char68 if l eading_char != leading_char_quality_title:69 stop_err( 'Invalid fastq format at line %d.' %(i))49 outfile_seq.write( '>%s\n' % line[1:] ) 50 elif fastq_block_lines == 2: 51 # second line is nucleotides 52 read_length = len( line ) 53 outfile_seq.write( '%s\n' % line ) 54 elif fastq_block_lines == 3: 55 # third line is +title_of_qualityscore ( might be skipped ) 56 if not qual_title_startswith: 57 qual_title_startswith = line_startswith 58 if line_startswith != qual_title_startswith: 59 stop_err( 'Invalid fastq format at line %d: %s.' % ( i + 1, line ) ) 70 60 quality_title = line[1:] 71 72 if (quality_title and (read_title != quality_title)): 73 stop_err('Invalid fastq format: titles for sequence and quality score are different.') 74 61 if quality_title and read_title != quality_title: 62 stop_err( 'Invalid fastq format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) ) 75 63 if not quality_title: 76 outfile_score.write( '>%s\n' %(read_title))64 outfile_score.write( '>%s\n' % read_title ) 77 65 else: 78 outfile_score.write( '>%s\n' %(line[1:]))79 80 else: # fourth line is expected to be the ASCII-coded quality scores66 outfile_score.write( '>%s\n' % line[1:] ) 67 else: 68 # fourth line is quality scores 81 69 qual = '' 82 83 # peek: ascii code or digits? 84 first_value = line.split()[0] 85 86 if first_value.isdigit(): 70 # peek: ascii or digits? 
71 val = line.split()[0] 72 if val.isdigit(): 87 73 # digits 88 74 qual = line 89 75 else: 90 # ascii code91 # guess leading char92 quality_score_length = len(line)93 if quality_score_length == (read_length+1): # first char is leading_char_score94 leading_char_score = ord(line[0:1])76 # ascii 77 quality_score_length = len( line ) 78 if quality_score_length == read_length + 1: 79 # first char is qual_score_startswith 80 qual_score_startswith = ord( line[0:1] ) 95 81 line = line[1:] 96 82 elif quality_score_length == read_length: 97 leading_char_score = default_coding_value # default83 qual_score_startswith = default_coding_value 98 84 else: 99 stop_err('Invalid fastq format: the number of quality scores is not the same as bases.') 100 101 for j, char in enumerate(line): 102 score = ord(char)-leading_char_score # 64 103 qual += (str(score) + ' ') 104 105 outfile_score.write('%s\n' %(qual)) 106 85 stop_err( 'Invalid fastq format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) ) 86 for j, char in enumerate( line ): 87 score = ord( char ) - qual_score_startswith # 64 88 qual = "%s%s " % ( qual, str( score ) ) 89 outfile_score.write( '%s\n' % qual ) 90 107 91 outfile_seq.close() 108 92 outfile_score.close() 93 94 if __name__ == "__main__": __main__() 109 95 110 tools/metag_tools/fastq_to_fasta_qual.xml
r1336 r1340 1 <tool id=" convert_fastq2fasta" name="FASTQ-to-FASTA" version="1.0.0">2 <description> converts FASTQ file to FASTA format</description>3 <command interpreter="python"> convert_fastq2fasta.py $input1 $output1 $output2</command>1 <tool id="fastq_to_fasta_qual" name="FASTQ-to-FASTA-QUAL" version="1.0.0"> 2 <description>extracts sequences and quality scores from FASTQ data</description> 3 <command interpreter="python">fastq_to_fasta_qual.py $input1 $output1 $output2</command> 4 4 <inputs> 5 <param name="input1" type="data" format="fastq " label="Fastq file"/>5 <param name="input1" type="data" format="fastq,fastqsolexa" label="Fastq file"/> 6 6 </inputs> 7 7 <outputs> … … 13 13 <test> 14 14 <param name="input1" value="1.fastq" ftype="fastq" /> 15 <output name="output1" file="convert_fastq2fasta_out2.fasta" /> 15 <output name="output1" file="fastq_to_fasta_qual_out2.fasta" /> 16 </test> 17 <test> 18 <param name="input1" value="1.fastqsolexa" ftype="fastq" /> 19 <output name="output1" file="fastq_to_fasta_qual_out4.fasta" /> 16 20 </test> 17 21 </tests> … … 20 24 **What it does** 21 25 22 This tool converts Solexa FASTQ data to FASTA format by generating 2 files, reads and quality scores.26 This tool extracts sequences and quality scores from FASTQ data ( both Sanger/Standard and Solexa variants ), producing a FASTA dataset and a QUAL dataset. 
23 27 24 28 ----- … … 26 30 **Example1** 27 31 28 - Converting the following S olexafastq data::32 - Converting the following Sanger/Standard fastq data:: 29 33 30 34 @seq1 … … 51 55 40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15 52 56 53 54 57 **Example2** 55 58 56 59 - Converting the following Solexa fastq data:: 57 60 58 @ seq161 @HANNIBAL_1_FC302VTAAXX:2:1:228:167 59 62 GAATTGATCAGGACATAGGACAACTGTAGGCACCAT 60 + seq163 +HANNIBAL_1_FC302VTAAXX:2:1:228:167 61 64 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 62 @ seq265 @HANNIBAL_1_FC302VTAAXX:2:1:156:340 63 66 GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG 64 + seq267 +HANNIBAL_1_FC302VTAAXX:2:1:156:340 65 68 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 66 69 67 70 - will extract the following sequences:: 68 71 69 >seq172 >HANNIBAL_1_FC302VTAAXX:2:1:228:167 70 73 GAATTGATCAGGACATAGGACAACTGTAGGCACCAT 71 >seq274 >HANNIBAL_1_FC302VTAAXX:2:1:156:340 72 75 GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG 73 76 74 77 - and quality scores:: 75 78 76 >seq177 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 78 >seq279 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 79 >HANNIBAL_1_FC302VTAAXX:2:1:228:167 80 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 81 >HANNIBAL_1_FC302VTAAXX:2:1:156:340 82 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 80 83 81 84 </help> universe_wsgi.ini.sample
r1334 r1340 177 177 fasta = galaxy.datatypes.sequence:Fasta,display_in_upload 178 178 fastq = galaxy.datatypes.sequence:Fastq,display_in_upload 179 fastqsolexa = galaxy.datatypes.sequence:FastqSolexa,display_in_upload 179 180 gff = galaxy.datatypes.interval:Gff,display_in_upload 180 181 gff3 = galaxy.datatypes.interval:Gff3,display_in_upload … … 301 302 10 = galaxy.datatypes.sequence:Lav 302 303 15 = galaxy.datatypes.sequence:Fasta 303 20 = galaxy.datatypes.interval:Wiggle 304 25 = galaxy.datatypes.images:Html 305 30 = galaxy.datatypes.sequence:Axt 306 35 = galaxy.datatypes.interval:Bed 307 40 = galaxy.datatypes.interval:CustomTrack 308 45 = galaxy.datatypes.interval:Gff 309 50 = galaxy.datatypes.interval:Gff3 310 55 = galaxy.datatypes.interval:Interval 304 20 = galaxy.datatypes.sequence:Fastq 305 25 = galaxy.datatypes.sequence:FastqSolexa 306 30 = galaxy.datatypes.interval:Wiggle 307 35 = galaxy.datatypes.images:Html 308 40 = galaxy.datatypes.sequence:Axt 309 45 = galaxy.datatypes.interval:Bed 310 50 = galaxy.datatypes.interval:CustomTrack 311 55 = galaxy.datatypes.interval:Gff 312 60 = galaxy.datatypes.interval:Gff3 313 65 = galaxy.datatypes.interval:Interval