root/cron/build_chrom_db.py

Revision 0:ae898a17920b, 2.2 kB (checked in by James Taylor <james@bx.psu.edu>, 2 years ago)

Moving james-wsgi branch to new trunk.

Line 
1 #!/usr/bin/env python
2
3 """
4 Connects to a UCSC table browser and scrapes chrominfo for every build
5 specified by an input file (such as one output by parse_builds.py).
6 If no input file is specified, it will connect using parse_builds.py to
7 retrieve a list of available builds.
8
9 All chromInfo is placed in a path with the convention
10 {dbpath}/buildname.len
11
12 Usage:
13 python build_chrom_db.py dbpath/ [builds_file]
14 """
15
16 import sys
17 import parse_builds
18 import urllib
19 import fileinput
20
def getchrominfo(url, db):
    """
    Generate the rows of the UCSC ``chromInfo`` table for one build.

    Requests the "primaryTable" output of the ``chromInfo`` table from the
    UCSC table browser and yields, for every data row, a two-item list
    holding the first two tab-separated fields as strings (for chromInfo
    these are presumably the chromosome name and its size).  Lines that
    start with "#" and lines with fewer than two fields are skipped.

    url -- accepted for interface compatibility but NOT used; the request
           always goes to the genome-test hgTables CGI hard-coded below.
    db  -- build name; sent as both the "db" and "hgta_track" parameters.

    Because this is a generator, the HTTP request is only issued when the
    first item is requested.
    """
    tableURL = "http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?"
    URL = tableURL + urllib.urlencode({
        "clade" : "",
        "org" : "",
        "db" : db,
        "hgta_outputType": "primaryTable",
        "hgta_group" : "allTables",
        "hgta_table" : "chromInfo",
        "hgta_track" : db,
        "hgta_regionType":"",
        "position":"",
        "hgta_doTopSubmit" : "get info"})
    page = urllib.urlopen(URL)
    # Read the whole response up front so the connection is always closed,
    # even if the consumer abandons the generator early.  No yield is placed
    # inside the try block, which keeps this valid on older Python 2.
    try:
        lines = page.readlines()
    finally:
        page.close()
    for line in lines:
        line = line.rstrip("\r\n")
        if line.startswith("#"):
            continue  # header/comment line
        fields = line.split("\t")
        if len(fields) > 1:
            yield [fields[0], fields[1]]
41
if __name__ == "__main__":
    # Script entry point: collect build names (from a file or from the UCSC
    # DAS server), then write one "<build>.len" file per build into dbpath.

    # Local import: only the entry point needs path joining.
    import os

    if len(sys.argv) == 1:
        print("Path to place chromInfo tables must be specified.")
        sys.exit(1)
    dbpath = sys.argv[1]
    builds = []
    if len(sys.argv) > 2:
        # Builds file: the first tab-separated column of each non-comment
        # line is taken as the build name.
        try:
            buildfile = fileinput.FileInput(sys.argv[2])
            try:
                for line in buildfile:
                    if line.startswith("#"):
                        continue
                    # Strip the line ending so a line without a tab does not
                    # yield a build name with a trailing newline; skip blanks.
                    name = line.rstrip("\r\n").split("\t")[0]
                    if name:
                        builds.append(name)
            finally:
                buildfile.close()
        except Exception:
            # Not a bare except: Ctrl-C should not be reported as a bad file.
            print("Bad input file.")
            sys.exit(1)
    else:
        # No builds file given: ask parse_builds for the available builds and
        # keep the first element of each entry it returns.
        try:
            for build in parse_builds.getbuilds("http://genome-test.cse.ucsc.edu/cgi-bin/das/dsn"):
                builds.append(build[0])
        except Exception:
            print("Unable to retrieve builds.")
            sys.exit(1)
    for build in builds:
        if build == "?":
            continue  # no lengths for unspecified chrom
        # os.path.join works whether or not dbpath ends with a separator.
        outfile = open(os.path.join(dbpath, build + ".len"), "w")
        try:
            print("Retrieving " + build)
            for chrominfo in getchrominfo("http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?", build):
                outfile.write("\t".join(chrominfo) + "\n")
        finally:
            # Close the output file even if the retrieval fails part-way.
            outfile.close()
Note: See TracBrowser for help on using the browser.