| 1 |
#!/usr/bin/env python |
|---|
| 2 |
|
|---|
| 3 |
""" |
|---|
| 4 |
Connects to a UCSC table browser and scrapes chrominfo for every build |
|---|
| 5 |
specified by an input file (such as one output by parse_builds.py). |
|---|
| 6 |
If no input file is specified, it will connect using parse_builds.py to |
|---|
| 7 |
retrieve a list of available builds. |
|---|
| 8 |
|
|---|
| 9 |
All chromInfo is placed in a path with the convention |
|---|
| 10 |
{dbpath}/buildname.len |
|---|
| 11 |
|
|---|
| 12 |
Usage: |
|---|
| 13 |
python build_chrom_db.py dbpath/ [builds_file] |
|---|
| 14 |
""" |
|---|
| 15 |
|
|---|
| 16 |
import sys |
|---|
| 17 |
import parse_builds |
|---|
| 18 |
import urllib |
|---|
| 19 |
import fileinput |
|---|
| 20 |
|
|---|
| 21 |
def getchrominfo(url, db):
    """Yield the first two columns of the chromInfo table for a build.

    Requests the ``chromInfo`` table of ``db`` from an hgTables CGI and
    parses the tab-separated response line by line.

    url -- base URL of the hgTables CGI, e.g.
           "http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?".  The
           encoded query string is appended to it.  When empty, the
           genome-test server above is used.
    db  -- name of the build to query; sent as both the ``db`` and
           ``hgta_track`` parameters.

    Yields two-element lists ``[fields[0], fields[1]]`` of strings, one
    per data line.  Lines starting with "#" and lines with fewer than two
    tab-separated fields are skipped.
    """
    # Honour the caller-supplied URL; previously the argument was ignored
    # and the genome-test address was always used.
    base = url or "http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?"
    # Make sure the query string can be appended directly to the base URL.
    if base[-1:] not in ("?", "&"):
        if "?" in base:
            base += "&"
        else:
            base += "?"
    query = urllib.urlencode({
        "clade": "",
        "org": "",
        "db": db,
        "hgta_outputType": "primaryTable",
        "hgta_group": "allTables",
        "hgta_table": "chromInfo",
        "hgta_track": db,
        "hgta_regionType": "",
        "position": "",
        "hgta_doTopSubmit": "get info"})
    page = urllib.urlopen(base + query)
    try:
        for line in page:
            line = line.rstrip("\r\n")
            if line.startswith("#"):
                continue
            fields = line.split("\t")
            if len(fields) > 1:
                yield [fields[0], fields[1]]
    finally:
        # Release the connection even if the consumer stops early or the
        # read fails part-way through.
        page.close()
|---|
| 41 |
|
|---|
| 42 |
def _read_builds_file(path):
    """Return the build names found in the first tab-separated column of path.

    Lines starting with "#", blank lines and lines whose first column is
    empty are skipped.  Line endings are stripped so that a line without
    any tab does not leave a trailing newline in the build name.
    """
    names = []
    buildfile = fileinput.FileInput(path)
    try:
        for line in buildfile:
            line = line.rstrip("\r\n")
            if not line.strip() or line.startswith("#"):
                continue
            name = line.split("\t")[0].strip()
            if name:
                names.append(name)
    finally:
        buildfile.close()
    return names


def main(argv=None):
    """Write one ``<build>.len`` file per build into the given directory.

    argv -- argument list in ``sys.argv`` layout (program name first);
            defaults to ``sys.argv``.  ``argv[1]`` is the output
            directory and the optional ``argv[2]`` is a builds file.
            Without a builds file the list of builds is fetched through
            ``parse_builds.getbuilds``.

    Exits with status 1 when the output path is missing, the builds file
    cannot be read, or the list of builds cannot be retrieved.
    """
    import os  # local import: only needed for building output paths

    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Path to place chromInfo tables must be specified.")
        sys.exit(1)
    dbpath = argv[1]

    if len(argv) > 2:
        # Catch Exception rather than everything so that Ctrl-C and
        # sys.exit() are not reported as a bad input file.
        try:
            builds = _read_builds_file(argv[2])
        except Exception:
            print("Bad input file.")
            sys.exit(1)
    else:
        try:
            builds = [build[0] for build in parse_builds.getbuilds(
                "http://genome-test.cse.ucsc.edu/cgi-bin/das/dsn")]
        except Exception:
            print("Unable to retrieve builds.")
            sys.exit(1)

    for build in builds:
        if build == "?":
            continue  # no lengths for unspecified chrom
        # os.path.join keeps the output inside dbpath whether or not the
        # caller supplied a trailing path separator.
        outfile = open(os.path.join(dbpath, build + ".len"), "w")
        try:
            print("Retrieving " + build)
            for chrominfo in getchrominfo(
                    "http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?", build):
                outfile.write("\t".join(chrominfo) + "\n")
        finally:
            outfile.close()


if __name__ == "__main__":
    main()
|---|