Changeset 1586:4e44d29377e3

Show
Ignore:
Timestamp:
10/30/08 14:46:57 (2 months ago)
Author:
Dan Blankenberg <dan@bx.psu.edu>
branch:
default
Message:

Annotation profiler: Better handling of the case where regions extend beyond known chromosome boundaries.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • tools/annotation_profiler/annotation_profiler_for_interval.py

    r1387 r1586  
    123123        self.table_regions_overlaped_count = {} #total number of table regions overlaping user's input intervals (non unique) 
    124124        self.interval_table_overlap_count = {} #total number of user input intervals which overlap table 
     125        self.region_size_errors = {} #dictionary of lists of invalid ranges by chromosome 
    125126    def add_region( self, chrom, start, end ): 
    126         self.total_interval_size += ( end - start ) 
     127        chrom_length = self.chrom_lengths.get( chrom ) 
     128        region_start = min( start, chrom_length ) 
     129        region_end = min( end, chrom_length ) 
     130        region_length = region_end - region_start 
     131         
     132        if region_length < 1 or region_start != start or region_end != end: 
     133            if chrom not in self.region_size_errors: 
     134                self.region_size_errors[chrom] = [] 
     135            self.region_size_errors[chrom].append( ( start, end ) ) 
     136            if region_length < 1: return 
     137         
     138        self.total_interval_size += region_length 
    127139        self.total_interval_count += 1 
    128140        if chrom not in self.chromosome_coverage: 
    129             self.chromosome_coverage[chrom] = bx.bitset.BitSet( self.chrom_lengths.get( chrom ) ) 
    130         self.chromosome_coverage[chrom].set_range( start, end - start ) 
    131         for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, start, end ): 
     141            self.chromosome_coverage[chrom] = bx.bitset.BitSet( chrom_length ) 
     142         
     143        self.chromosome_coverage[chrom].set_range( region_start, region_length ) 
     144        for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, region_start, region_end ): 
    132145            if table_name not in self.table_coverage: 
    133146                self.table_coverage[table_name] = 0 
     
    215228            out.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count ) ) 
    216229    out.close() 
     230     
     231    #report chrom size errors as needed: 
     232    if table_coverage_summary.region_size_errors: 
     233        print "Regions provided extended beyond known chromosome lengths, and have been truncated as necessary, for the following intervals:" 
     234        for chrom, regions in table_coverage_summary.region_size_errors.items(): 
     235            if len( regions ) > 3: 
     236                extra_region_info = ", ... " 
     237            else: 
     238                extra_region_info = "" 
     239            print "%s has max length of %s, exceeded by %s%s." % ( chrom, chrom_lengths.get( chrom ), ", ".join( map( str, regions[:3] ) ), extra_region_info ) 
    217240 
    218241class ChromosomeLengths: