Changeset 624:4e49e75880be
- Timestamp:
- 09/27/07 10:10:16 (1 year ago)
- Files:
-
- lib/galaxy/app.py (modified) (1 diff)
- lib/galaxy/config.py (modified) (1 diff)
- lib/galaxy/datatypes/data.py (modified) (1 diff)
- lib/galaxy/datatypes/images.py (modified) (4 diffs)
- lib/galaxy/datatypes/interval.py (modified) (9 diffs)
- lib/galaxy/datatypes/registry.py (modified) (6 diffs)
- lib/galaxy/datatypes/sequence.py (modified) (5 diffs)
- lib/galaxy/datatypes/sniff.py (modified) (3 diffs)
- lib/galaxy/datatypes/tabular.py (modified) (5 diffs)
- lib/galaxy/datatypes/test/test.gff (modified) (1 diff)
- universe_wsgi.ini.sample (modified) (3 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
lib/galaxy/app.py
r506 r624 15 15 config.configure_logging( self.config ) 16 16 #Set up datatypes registry 17 self.datatypes_registry = galaxy.datatypes.registry.Registry(datatypes = self.config.datatypes)17 self.datatypes_registry = galaxy.datatypes.registry.Registry(datatypes=self.config.datatypes, sniff_order=self.config.sniff_order) 18 18 galaxy.model.set_datatypes_registry(self.datatypes_registry) 19 19 # Connect up the object model lib/galaxy/config.py
r540 r624 61 61 except ConfigParser.NoSectionError: 62 62 self.datatypes = [] 63 #Store sniff order config 64 try: 65 self.sniff_order = global_conf_parser.items("galaxy:sniff_order") 66 except ConfigParser.NoSectionError: 67 self.sniff_order = [] 63 68 self.datatype_converters_config = kwargs.get( 'datatype_converters_config_file', "datatype_converters_conf.xml" ) 64 69 self.datatype_converters_path = kwargs.get( 'datatype_converters_path', os.path.join(self.root,"lib/galaxy/datatypes/converters") ) lib/galaxy/datatypes/data.py
r550 r624 3 3 from cgi import escape 4 4 from galaxy.datatypes.metadata import * 5 log = logging.getLogger(__name__)6 5 from galaxy.datatypes.metadata import MetadataElement 7 6 from galaxy.datatypes import metadata 7 8 log = logging.getLogger(__name__) 9 10 # Valid strand column values 11 valid_strand = ['+', '-', '.'] 8 12 9 13 # Constants for data states lib/galaxy/datatypes/images.py
r479 r624 5 5 import data 6 6 import logging 7 from galaxy.datatypes.sniff import * 7 8 from urllib import urlencode 8 9 … … 18 19 class Gmaj( data.Data ): 19 20 """Class describing a GMAJ Applet""" 21 file_ext = "gmaj.zip" 22 20 23 def set_peek( self, dataset ): 21 24 dataset.peek = "<p align=\"center\"><applet code=\"edu.psu.bx.gmaj.MajApplet.class\" archive=\"/static/gmaj/gmaj.jar\" width=\"200\" height=\"30\" align=\"middle\"> <param name=bundle value=\"display?id="+str(dataset.id)+"&tofile=yes&toext=.zip\"> <param name=buttonlabel value=\"Launch GMAJ\"><param name=nobutton value=\"false\"><param name=urlpause value=\"100\"><param name=debug value=\"false\"><i>Your browser is not responding to the <applet> tag.</i></applet></p>" … … 30 33 """Returns the mime type of the datatype""" 31 34 return 'application/zip' 35 def sniff( self, filename ): 36 #TODO: fix me 37 return '' 32 38 33 34 class Laj( data.Text ):35 """Class describing a LAJ Applet"""36 def set_peek( self, dataset ):37 export_url = "/history_add_to?"+urlencode({'history_id':dataset.history_id,'ext':'lav','name':'LAJ Output','info':'Added by LAJ','dbkey':dataset.dbkey})38 dataset.peek = "<p align=\"center\"><applet code=\"edu.psu.cse.bio.laj.LajApplet.class\" archive=\"static/laj/laj.jar\" width=\"200\" height=\"30\"><param name=buttonlabel value=\"Launch LAJ\"><param name=title value=\"LAJ in Galaxy\"><param name=posturl value=\""+export_url+"\"><param name=alignfile1 value=\"display?id="+str(dataset.id)+"\"><param name=noseq value=\"true\"></applet></p>"39 dataset.blurb = 'LAJ Multiple Alignment Viewer'40 41 def display_peek(self, dataset):42 try:43 return dataset.peek44 except:45 return "peek unavailable"46 47 39 class Html( data.Text ): 48 40 """Class describing an html file""" 41 file_ext = "html" 42 49 43 def set_peek( self, dataset ): 50 44 dataset.peek = "HTML file (%s)" % ( data.nice_size( dataset.get_size() ) ) … … 54 48 """Returns the mime type of the datatype""" 55 49 return 'text/html' 50 51 def sniff( self, filename ): 52 """ 53 Determines wether the file is in html format 54 55 >>> fname = get_test_fname( 'complete.bed' ) 56 >>> Html().sniff( fname ) 57 '' 58 >>> fname = get_test_fname( 'file.html' ) 59 >>> Html().sniff( fname ) 60 'html' 61 """ 62 headers = get_headers( filename, None ) 63 64 try: 65 for i, hdr in enumerate(headers): 66 if hdr and hdr[0].lower().find( '<html>' ) >=0: 67 return self.file_ext 68 return '' 69 except: 70 return '' 71 72 class Laj( data.Text ): 73 """Class describing a LAJ Applet""" 74 file_ext = "laj" 75 76 def set_peek( self, dataset ): 77 export_url = "/history_add_to?"+urlencode({'history_id':dataset.history_id,'ext':'lav','name':'LAJ Output','info':'Added by LAJ','dbkey':dataset.dbkey}) 78 dataset.peek = "<p align=\"center\"><applet code=\"edu.psu.cse.bio.laj.LajApplet.class\" archive=\"static/laj/laj.jar\" width=\"200\" height=\"30\"><param name=buttonlabel value=\"Launch LAJ\"><param name=title value=\"LAJ in Galaxy\"><param name=posturl value=\""+export_url+"\"><param name=alignfile1 value=\"display?id="+str(dataset.id)+"\"><param name=noseq value=\"true\"></applet></p>" 79 dataset.blurb = 'LAJ Multiple Alignment Viewer' 80 def display_peek(self, dataset): 81 try: 82 return dataset.peek 83 except: 84 return "peek unavailable" 85 def sniff( self, filename ): 86 #TODO: fix me... 87 return '' 88 lib/galaxy/datatypes/interval.py
r609 r624 9 9 import data 10 10 from galaxy import util 11 from galaxy.datatypes.sniff import * 11 12 from cgi import escape 12 13 import urllib … … 37 38 class Interval( Tabular ): 38 39 """Tab delimited data containing interval information""" 40 file_ext = "interval" 39 41 40 42 """Add metadata elements""" … … 44 46 MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 ) 45 47 MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False ) 46 47 48 48 49 def __init__(self, **kwd): … … 180 181 return [("lines","Remove erroneous lines")] 181 182 183 def sniff( self, filename ): 184 """ 185 Checks for 'intervalness' 186 187 This format is mostly used by galaxy itself. Valid interval files should include 188 a valid header comment, but this seems to be loosely regulated. 189 190 >>> fname = get_test_fname( 'test_space.bed' ) 191 >>> Interval().sniff( fname ) 192 '' 193 >>> fname = get_test_fname( 'interval.interval' ) 194 >>> Interval().sniff( fname ) 195 'interval' 196 """ 197 headers = get_headers( filename, '\t' ) 198 try: 199 """ 200 If we got here, we already know the file is_column_based and is not bed, 201 so we'll just look for some valid data. 202 """ 203 for hdr in headers: 204 if not (hdr[0] == '' or hdr[0].startswith( '#' )): 205 if len(hdr) < 3: 206 return '' 207 try: 208 map( int, [hdr[1], hdr[2]] ) 209 except: 210 return '' 211 return self.file_ext 212 except: 213 return '' 214 182 215 class Bed( Interval ): 183 216 """Tab delimited data in BED format""" 217 file_ext = "bed" 184 218 185 219 """Add metadata elements""" … … 265 299 except: return "This item contains no content" 266 300 301 def sniff( self, filename ): 302 """ 303 Checks for 'bedness' 304 305 BED lines have three required fields and nine additional optional fields. 306 The number of fields per line must be consistent throughout any single set of data in 307 an annotation track. 308 309 For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format1 310 311 >>> fname = get_test_fname( 'test_tab.bed' ) 312 >>> Bed().sniff( fname ) 313 'bed' 314 >>> fname = get_test_fname( 'interval.bed' ) 315 >>> Bed().sniff( fname ) 316 '' 317 >>> fname = get_test_fname( 'complete.bed' ) 318 >>> Bed().sniff( fname ) 319 'bed' 320 """ 321 col1_startswith = ['chr', 'chl', 'groupun', 'reftig_', 'scaffold', 'super_', 'vcho'] 322 headers = get_headers( filename, '\t' ) 323 try: 324 if not headers: 325 return '' 326 for hdr in headers: 327 valid_col1 = False 328 if len(hdr) < 3 or len(hdr) > 12: 329 return '' 330 for str in col1_startswith: 331 if hdr[0].lower().startswith(str): 332 valid_col1 = True 333 break 334 if valid_col1: 335 try: 336 map( int, [hdr[1], hdr[2]] ) 337 except: 338 return '' 339 if len(hdr) > 3: 340 """ 341 Since all 9 of these fields are optional, it is difficult to test 342 for specific column values... 343 """ 344 optionals = hdr[3:] 345 """ 346 ...we can, however, test complete BED definitions fairly easily. 347 """ 348 if len(optionals) == 9: 349 try: 350 map ( int, [optionals[1], optionals[3], optionals[4], optionals[5], optionals[6]] ) 351 except: 352 return '' 353 score = int(optionals[1]) 354 if score < 0 or score > 1000: 355 return '' 356 if optionals[2] not in ['+', '-']: 357 return '' 358 if int(optionals[5]) != 0: 359 return '' 360 block_count = int(optionals[6]) 361 """ 362 Sometimes the blosck_sizes and block_starts lists end in extra commas 363 """ 364 block_sizes = optionals[7].rstrip(',').split(',') 365 block_starts = optionals[8].rstrip(',').split(',') 366 if len(block_sizes) != block_count or len(block_starts) != block_count: 367 return '' 368 elif len(optionals) > 4 and len(optionals) < 9: 369 """ 370 Here it gets a bit trickier, but in this case, we can be somewhat confident 371 that optionals will include a strand column 372 """ 373 is_valid_strand = False 374 for ele in optionals: 375 if ele in data.valid_strand: 376 is_valid_strand = True 377 if not is_valid_strand: 378 return '' 379 else: 380 return '' 381 return self.file_ext 382 except: 383 return '' 384 267 385 class Gff( Tabular ): 268 386 """Tab delimited data in Gff format""" 387 file_ext = "gff" 269 388 270 389 """Add metadata elements""" … … 331 450 return ret_val 332 451 452 def sniff( self, filename ): 453 """ 454 Determines whether the file is in gff format 455 456 GFF lines have nine required fields that must be tab-separated. 457 458 For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format3 459 460 >>> fname = get_test_fname( 'gff_version_3.gff' ) 461 >>> Gff().sniff( fname ) 462 '' 463 >>> fname = get_test_fname( 'test.gff' ) 464 >>> Gff().sniff( fname ) 465 'gff' 466 """ 467 headers = get_headers( filename, '\t' ) 468 try: 469 if len(headers) < 2: 470 return '' 471 for hdr in headers: 472 if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0: 473 return '' 474 if hdr and hdr[0] and not hdr[0].startswith( '#' ): 475 if len(hdr) != 9: 476 return '' 477 try: 478 map( int, [hdr[3], hdr[4]] ) 479 except: 480 return '' 481 if hdr[5] != '.': 482 try: 483 score = int(hdr[5]) 484 except: 485 return '' 486 if (score < 0 or score > 1000): 487 return '' 488 if hdr[6] not in data.valid_strand: 489 return '' 490 return self.file_ext 491 except: 492 return '' 493 333 494 class Gff3( Gff ): 334 495 """Tab delimited data in Gff3 format""" 496 file_ext = "gff3" 335 497 336 498 """Add metadata elements""" … … 341 503 Gff.__init__(self, **kwd) 342 504 505 def sniff( self, filename ): 506 """ 507 Determines whether the file is in gff version 3 format 508 509 GFF 3 format: 510 511 1) adds a mechanism for representing more than one level 512 of hierarchical grouping of features and subfeatures. 513 2) separates the ideas of group membership and feature name/id 514 3) constrains the feature type field to be taken from a controlled 515 vocabulary. 516 4) allows a single feature, such as an exon, to belong to more than 517 one group at a time. 518 5) provides an explicit convention for pairwise alignments 519 6) provides an explicit convention for features that occupy disjunct regions 520 521 The format consists of 9 columns, separated by tabs (NOT spaces). 522 523 Undefined fields are replaced with the "." character, as described in the original GFF spec. 524 525 For complete details see http://song.sourceforge.net/gff3.shtml 526 527 >>> fname = get_test_fname( 'test.gff' ) 528 >>> Gff3().sniff( fname ) 529 '' 530 >>> fname = get_test_fname('gff_version_3.gff') 531 >>> Gff3().sniff( fname ) 532 'gff3' 533 """ 534 valid_gff3_strand = ['+', '-', '.', '?'] 535 valid_gff3_phase = ['.', '0', '1', '2'] 536 headers = get_headers( filename, '\t' ) 537 try: 538 if len(headers) < 2: 539 return '' 540 for hdr in headers: 541 if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) < 0: 542 return '' 543 if hdr and hdr[0] and not hdr[0].startswith( '#' ): 544 if len(hdr) != 9: 545 return '' 546 try: 547 map( int, [hdr[3]] ) 548 except: 549 if hdr[3] != '.': 550 return '' 551 try: 552 map( int, [hdr[4]] ) 553 except: 554 if hdr[4] != '.': 555 return '' 556 if hdr[5] != '.': 557 try: 558 score = int(hdr[5]) 559 except: 560 return '' 561 if (score < 0 or score > 1000): 562 return '' 563 if hdr[6] not in valid_gff3_strand: 564 return '' 565 if hdr[7] not in valid_gff3_phase: 566 return '' 567 return self.file_ext 568 except: 569 return '' 570 343 571 class Wiggle( Tabular ): 344 572 """Tab delimited data in wiggle format""" 573 file_ext = "wig" 574 345 575 MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True ) 346 576 347 577 def make_html_table(self, data): 348 578 return Tabular.make_html_table(self, data, skipchar='#') 349 350 #Extend Tabular type, since interval tools will fail on track def line (we should fix this) 351 #This is a skeleton class for now, allows viewing at ucsc and formatted peeking. 579 580 def sniff( self, filename ): 581 """ 582 Determines wether the file is in wiggle format 583 584 The .wig format is line-oriented. Wiggle data is preceeded by a track definition line, 585 which adds a number of options for controlling the default display of this track. 586 Following the track definition line is the track data, which can be entered in several 587 different formats. 588 589 The track definition line begins with the word 'track' followed by the track type. 590 The track type with version is REQUIRED, and it currently must be wiggle_0. For example, 591 track type=wiggle_0... 592 593 For complete details see http://genome.ucsc.edu/goldenPath/help/wiggle.html 594 595 >>> fname = get_test_fname( 'interval.bed' ) 596 >>> Wiggle().sniff( fname ) 597 '' 598 >>> fname = get_test_fname( 'wiggle.wig' ) 599 >>> Wiggle().sniff( fname ) 600 'wig' 601 """ 602 headers = get_headers( filename, None ) 603 try: 604 for hdr in headers: 605 if len(hdr) > 1 and hdr[0] == 'track' and hdr[1].startswith('type=wiggle'): 606 return self.file_ext 607 return '' 608 except: 609 return '' 610 352 611 class CustomTrack ( Tabular ): 353 612 """UCSC CustomTrack""" 354 613 file_ext = "customtrack" 614 355 615 def __init__(self, **kwd): 356 616 """Initialize interval datatype, by adding UCSC display app""" … … 395 655 return ret_val 396 656 397 #Extend Tabular type, since interval tools will fail on track def line (we should fix this) 398 #This is a skeleton class for now, allows viewing at GBrowse and formatted peeking. 657 def sniff( self, filename ): 658 """ 659 Determines whether the file is in customtrack format. 660 661 CustomTrack files are built within Galaxy and are basically bed or interval files with the first line looking 662 something like this. 663 664 track name="User Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1 665 666 >>> fname = get_test_fname( 'complete.bed' ) 667 >>> CustomTrack().sniff( fname ) 668 '' 669 >>> fname = get_test_fname( 'ucsc.customtrack' ) 670 >>> CustomTrack().sniff( fname ) 671 'customtrack' 672 """ 673 headers = get_headers( filename, None ) 674 first_line = True 675 for hdr in headers: 676 if first_line: 677 try: 678 if hdr[0].startswith('track'): 679 first_line = False 680 else: 681 return '' 682 except: 683 return '' 684 else: 685 try: 686 if not (hdr[0] == '' or hdr[0].startswith( '#' )): 687 if len(hdr) < 3: 688 return '' 689 try: 690 map( int, [hdr[1], hdr[2]] ) 691 except: 692 return '' 693 except: 694 return '' 695 return self.file_ext 696 399 697 class GBrowseTrack ( Tabular ): 698 """GMOD GBrowseTrack""" 699 file_ext = "gbrowsetrack" 400 700 401 701 def __init__(self, **kwd): … … 432 732 return open(dataset.file_name) 433 733 734 def sniff( self, filename ): 735 """ 736 Determines whether the file is in gbrowsetrack format. 737 738 GBrowseTrack files are built within Galaxy. 739 TODO: Not yet sure what this file will look like. Fix this sniffer and add some unit tests here as soon as we know. 740 """ 741 return '' 742 434 743 if __name__ == '__main__': 435 744 import doctest, sys lib/galaxy/datatypes/registry.py
r604 r624 9 9 10 10 class Registry( object ): 11 def __init__( self, datatypes =[] ):11 def __init__( self, datatypes=[], sniff_order=[] ): 12 12 self.log = logging.getLogger(__name__) 13 13 self.datatypes_by_extension = {} 14 14 self.mimetypes_by_extension = {} 15 15 self.datatype_converters = odict() 16 self.sniff_order = [] 16 17 for ext, kind in datatypes: 17 18 try: … … 39 40 'data' : data.Data(), 40 41 'bed' : interval.Bed(), 41 'txt' : data.Text(), 42 'text' : data.Text(), 42 'txt' : data.Text(), 43 43 'interval' : interval.Interval(), 44 44 'tabular' : tabular.Tabular(), … … 55 55 'lav' : sequence.Lav(), 56 56 'html' : images.Html(), 57 'customtrack' : interval.CustomTrack(), 58 'gbrowsetrack' : interval.GBrowseTrack() 57 'customtrack' : interval.CustomTrack() 59 58 } 60 59 self.mimetypes_by_extension = { 61 60 'data' : 'application/octet-stream', 62 61 'bed' : 'text/plain', 63 'txt' : 'text/plain', 64 'text' : 'text/plain', 62 'txt' : 'text/plain', 65 63 'interval' : 'text/plain', 66 64 'tabular' : 'text/plain', … … 77 75 'lav' : 'text/plain', 78 76 'html' : 'text/html', 79 'customtrack' : 'text/plain', 80 'gbrowsetrack' : 'text/plain' 77 'customtrack' : 'text/plain' 81 78 } 79 """ 80 The order in which we attempt to determine data types is critical 81 because some formats are much more flexibly defined than others. 82 """ 83 sniff_order.sort() 84 for ord, kind in sniff_order: 85 try: 86 fields = kind.split(":") 87 datatype_module = fields[0] 88 datatype_class = fields[1] 89 fields = datatype_module.split(".") 90 module = __import__(fields.pop(0)) 91 for mod in fields: module = getattr(module,mod) 92 aclass = getattr(module, datatype_class)() 93 included = False 94 for atype in self.sniff_order: 95 if isinstance(atype, aclass.__class__): 96 included = True 97 break 98 if not included: 99 self.sniff_order.append(aclass) 100 except Exception, exc: 101 self.log.warning('error appending datatype: %s to sniff_order, error: %s' %(str(kind), str(exc))) 102 #default values 103 if len(self.sniff_order) < 1: 104 self.sniff_order = [ 105 images.Gmaj(), 106 images.Laj(), 107 sequence.Maf(), 108 sequence.Lav(), 109 sequence.Fasta(), 110 interval.Wiggle(), 111 images.Html(), 112 sequence.Axt(), 113 interval.Bed(), 114 interval.CustomTrack(), 115 interval.Gff(), 116 interval.Gff3(), 117 interval.Interval() 118 ] 119 def append_to_sniff_order(): 120 """Just in case any supported data types are not included in the config's sniff_order section.""" 121 for ext in self.datatypes_by_extension: 122 datatype = self.datatypes_by_extension[ext] 123 included = False 124 for atype in self.sniff_order: 125 if isinstance(atype, datatype.__class__): 126 included = True 127 break 128 if not included: 129 self.sniff_order.append(datatype) 130 append_to_sniff_order() 82 131 83 132 def get_mimetype_by_extension(self, ext ): 84 """ 85 Returns a mimetype based on an extension 86 """ 133 """Returns a mimetype based on an extension""" 87 134 try: 88 135 mimetype = self.mimetypes_by_extension[ext] … … 94 141 95 142 def get_datatype_by_extension(self, ext ): 96 """ 97 Returns a datatype based on an extension 98 """ 143 """Returns a datatype based on an extension""" 99 144 try: 100 145 builder = self.datatypes_by_extension[ext] … … 115 160 116 161 def old_change_datatype(self, data, ext): 117 """ 118 Creates and returns a new datatype based on an existing data and an extension 119 """ 162 """Creates and returns a new datatype based on an existing data and an extension""" 120 163 newdata = factory(ext)(id=data.id) 121 164 for key, value in data.__dict__.items(): lib/galaxy/datatypes/sequence.py
r608 r624 8 8 from galaxy.datatypes import metadata 9 9 from galaxy import util 10 from sniff import * 10 11 11 12 log = logging.getLogger(__name__) … … 23 24 class Fasta( Sequence ): 24 25 """Class representing a FASTA sequence""" 26 file_ext = "fasta" 25 27 26 28 def set_peek( self, dataset ): … … 38 40 dataset.blurb = '%d sequences' % count 39 41 42 def sniff(self, filename): 43 """ 44 Determines whether the file is in fasta format 45 46 A sequence in FASTA format consists of a single-line description, followed by lines of sequence data. 47 The first character of the description line is a greater-than (">") symbol in the first column. 48 All lines should be shorter than 80 charcters 49 50 For complete details see http://www.g2l.bio.uni-goettingen.de/blast/fastades.html 51 52 >>> fname = get_test_fname( 'sequence.maf' ) 53 >>> Fasta().sniff( fname ) 54 '' 55 >>> fname = get_test_fname( 'sequence.fasta' ) 56 >>> Fasta().sniff( fname ) 57 'fasta' 58 """ 59 headers = get_headers( filename, None ) 60 try: 61 if len(headers) > 1 and headers[0][0] and headers[0][0][0] == ">": 62 return self.file_ext 63 else: 64 return '' 65 except: 66 return '' 67 40 68 try: 41 69 import pkg_resources; pkg_resources.require( "bx-python" ) … … 43 71 except: 44 72 pass 73 45 74 class Maf( Alignment ): 46 75 """Class describing a Maf alignment""" 47 76 file_ext = "maf" 77 48 78 def init_meta( self, dataset, copy_from=None ): 49 79 Alignment.init_meta( self, dataset, copy_from=copy_from ) … … 78 108 return False 79 109 110 def sniff( self, filename ): 111 """ 112 Determines wether the file is in maf format 113 114 The .maf format is line-oriented. Each multiple alignment ends with a blank line. 115 Each sequence in an alignment is on a single line, which can get quite long, but 116 there is no length limit. Words in a line are delimited by any white space. 117 Lines starting with # are considered to be comments. Lines starting with ## can 118 be ignored by most programs, but contain meta-data of one form or another. 119 120 The first line of a .maf file begins with ##maf. This word is followed by white-space-separated 121 variable=value pairs. There should be no white space surrounding the "=". 122 123 For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5 124 125 >>> fname = get_test_fname( 'sequence.maf' ) 126 >>> Maf().sniff( fname ) 127 'maf' 128 >>> fname = get_test_fname( 'sequence.fasta' ) 129 >>> Maf().sniff( fname ) 130 '' 131 """ 132 headers = get_headers( filename, None ) 133 try: 134 if len(headers) > 1 and headers[0][0] and headers[0][0] == "##maf": 135 return self.file_ext 136 else: 137 return '' 138 except: 139 return '' 140 80 141 class Axt( Alignment ): 81 142 """Class describing an axt alignment""" 143 file_ext = "axt" 144 145 def sniff( self, filename ): 146 """ 147 Determines whether the file is in axt format 148 149 axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab 150 at Penn State University. Each alignment block in an axt file contains three lines: a summary 151 line and 2 sequence lines. Blocks are separated from one another by blank lines. 152 153 The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields: 154 155 For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html 156 157 >>> fname = get_test_fname( 'alignment.axt' ) 158 >>> Axt().sniff( fname ) 159 'axt' 160 >>> fname = get_test_fname( 'alignment.lav' ) 161 >>> Axt().sniff( fname ) 162 '' 163 """ 164 headers = get_headers( filename, None ) 165 if len(headers) < 4: 166 return '' 167 try: 168 """Assume the summary line is the first line of the file.""" 169 line = headers[0] 170 except: 171 return '' 172 173 if len(line) != 9: 174 return '' 175 try: 176 map ( int, [line[0], line[2], line[3], line[5], line[6], line[8]] ) 177 except: 178 return '' 179 if line[7] not in data.valid_strand: 180 return '' 181 return self.file_ext 82 182 83 183 class Lav( Alignment ): 84 184 """Class describing a LAV alignment""" 85 185 file_ext = "lav" 186 187 def sniff( self, filename ): 188 """ 189 Determines whether the file is in lav format 190 191 LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ. 192 The first line of a .lav file begins with #:lav. 193 194 For complete details see http://www.bioperl.org/wiki/LAV_alignment_format 195 196 >>> fname = get_test_fname( 'alignment.lav' ) 197 >>> Lav().sniff( fname ) 198 'lav' 199 >>> fname = get_test_fname( 'alignment.axt' ) 200 >>> Lav().sniff( fname ) 201 '' 202 """ 203 headers = get_headers( filename, None ) 204 try: 205 if len(headers) > 1 and headers[0][0] and headers[0][0].startswith('#:lav'): 206 return self.file_ext 207 else: 208 return '' 209 except: 210 return '' 211 212 lib/galaxy/datatypes/sniff.py
r604 r624 3 3 """ 4 4 import logging, sys, os, csv, tempfile, shutil, re 5 import registry 5 6 6 7 log = logging.getLogger(__name__) 7 8 valid_strand = ['+', '-', '.']9 valid_gff3_strand = ['+', '-', '.', '?']10 valid_gff3_phase = ['.', '0', '1', '2']11 8 12 9 def get_test_fname(fname): … … 116 113 if not headers: 117 114 return False 118 119 115 for hdr in headers[skip:]: 120 116 if len(hdr) > 1 and hdr[0] and not hdr[0].startswith('#'): 121 117 count = len(hdr) 122 118 break 123 124 119 if count < 2: 125 120 return False 126 127 121 for hdr in headers[skip:]: 128 122 if len(hdr) > 1 and hdr[0] and not hdr[0].startswith('#') and len(hdr) != count: 129 123 return False 130 124 return True 131 132 def is_fasta(headers):133 """134 Determines wether the file is in fasta format135 136 A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.137 The first character of the description line is a greater-than (">") symbol in the first column.138 All lines should be shorter than 80 charcters139 140 For complete details see http://www.g2l.bio.uni-goettingen.de/blast/fastades.html141 142 >>> headers = get_headers(__file__, ' ')143 >>> is_fasta(headers)144 False145 >>> fname = get_test_fname('sequence.fasta')146 >>> headers = get_headers(fname,' ')147 >>> is_fasta(headers)148 True149 """150 try:151 return len(headers) > 1 and headers[0][0] and headers[0][0][0] == ">"152 except:153 return False154 125 155 def is_gff(headers): 156 """ 157 Determines wether the file is in gff format 158 159 GFF lines have nine required fields that must be tab-separated. 160 161 For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format3 162 163 >>> headers = get_headers(__file__,' ') 164 >>> is_fasta(headers) 165 False 166 >>> fname = get_test_fname('test.gff') 167 >>> headers = get_headers(fname,'\\t') 168 >>> is_gff(headers) 169 True 170 """ 171 try: 172 if len(headers) < 2: 173 return False 174 for hdr in headers: 175 if len( hdr ) > 1 and hdr[0] and not hdr[0].startswith( '#' ): 176 if len(hdr) != 9: 177 return False 178 try: 179 map( int, [hdr[3], hdr[4]] ) 180