[2] | 1 | #!/usr/bin/env python
|
---|
| 2 | #Dan Blankenberg
|
---|
| 3 |
|
---|
| 4 | #Harvest Bacteria
|
---|
| 5 | #Connects to NCBI's Microbial Genome Projects website and scrapes it for information.
|
---|
| 6 | #Downloads and converts annotations for each Genome
|
---|
| 7 |
|
---|
| 8 | import sys, os, time
|
---|
| 9 | from urllib2 import urlopen
|
---|
| 10 | from urllib import urlretrieve
|
---|
| 11 | from ftplib import FTP
|
---|
| 12 | from BeautifulSoup import BeautifulSoup
|
---|
| 13 | from util import get_bed_from_genbank, get_bed_from_glimmer3, get_bed_from_GeneMarkHMM, get_bed_from_GeneMark
|
---|
| 14 |
|
---|
| 15 | assert sys.version_info[:2] >= ( 2, 4 )
|
---|
| 16 |
|
---|
| 17 | #this defines the types of ftp files we are interested in, and how to process/convert them to a form for our use
|
---|
| 18 | desired_ftp_files = {'GeneMark':{'ext':'GeneMark-2.5f','parser':'process_GeneMark'},
|
---|
| 19 | 'GeneMarkHMM':{'ext':'GeneMarkHMM-2.6m','parser':'process_GeneMarkHMM'},
|
---|
| 20 | 'Glimmer3':{'ext':'Glimmer3','parser':'process_Glimmer3'},
|
---|
| 21 | 'fna':{'ext':'fna','parser':'process_FASTA'},
|
---|
| 22 | 'gbk':{'ext':'gbk','parser':'process_Genbank'} }
|
---|
| 23 |
|
---|
| 24 |
|
---|
| 25 |
|
---|
| 26 | #number, name, chroms, kingdom, group, genbank, refseq, info_url, ftp_url
|
---|
| 27 | def iter_genome_projects( url = "http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi?view=1", info_url_base = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=" ):
|
---|
| 28 | for row in BeautifulSoup( urlopen( url ) ).findAll( name = 'tr', bgcolor = ["#EEFFDD", "#E8E8DD"] ):
|
---|
| 29 | row = str( row ).replace( "\n", "" ).replace( "\r", "" )
|
---|
| 30 |
|
---|
| 31 | fields = row.split( "</td>" )
|
---|
| 32 |
|
---|
| 33 | org_num = fields[0].split( "list_uids=" )[-1].split( "\"" )[0]
|
---|
| 34 |
|
---|
| 35 | name = fields[1].split( "\">" )[-1].split( "<" )[0]
|
---|
| 36 |
|
---|
| 37 | kingdom = "archaea"
|
---|
| 38 | if "<td class=\"bacteria\" align=\"center\">B" in fields[2]:
|
---|
| 39 | kingdom = "bacteria"
|
---|
| 40 |
|
---|
| 41 | group = fields[3].split( ">" )[-1]
|
---|
| 42 |
|
---|
| 43 | info_url = "%s%s" % ( info_url_base, org_num )
|
---|
| 44 |
|
---|
| 45 | org_genbank = fields[7].split( "\">" )[-1].split( "<" )[0].split( "." )[0]
|
---|
| 46 | org_refseq = fields[8].split( "\">" )[-1].split( "<" )[0].split( "." )[0]
|
---|
| 47 |
|
---|
| 48 | #seems some things donot have an ftp url, try and except it here:
|
---|
| 49 | try:
|
---|
| 50 | ftp_url = fields[22].split( "href=\"" )[1].split( "\"" )[0]
|
---|
| 51 | except:
|
---|
| 52 | print "FAILED TO AQUIRE FTP ADDRESS:", org_num, info_url
|
---|
| 53 | ftp_url = None
|
---|
| 54 |
|
---|
| 55 | chroms = get_chroms_by_project_id( org_num )
|
---|
| 56 |
|
---|
| 57 | yield org_num, name, chroms, kingdom, group, org_genbank, org_refseq, info_url, ftp_url
|
---|
| 58 |
|
---|
| 59 | def get_chroms_by_project_id( org_num, base_url = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=" ):
|
---|
| 60 | html_count = 0
|
---|
| 61 | html = None
|
---|
| 62 | while html_count < 500 and html == None:
|
---|
| 63 | html_count += 1
|
---|
| 64 | url = "%s%s" % ( base_url, org_num )
|
---|
| 65 | try:
|
---|
| 66 | html = urlopen( url )
|
---|
| 67 | except:
|
---|
| 68 | print "GENOME PROJECT FAILED:", html_count, "org:", org_num, url
|
---|
| 69 | html = None
|
---|
| 70 | time.sleep( 1 ) #Throttle Connection
|
---|
| 71 | if html is None:
|
---|
| 72 | "GENOME PROJECT COMPLETELY FAILED TO LOAD", "org:", org_num,"http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids="+org_num
|
---|
| 73 | return None
|
---|
| 74 |
|
---|
| 75 | chroms = []
|
---|
| 76 | for chr_row in BeautifulSoup( html ).findAll( "tr", { "class" : "vvv" } ):
|
---|
| 77 | chr_row = str( chr_row ).replace( "\n","" ).replace( "\r", "" )
|
---|
| 78 | fields2 = chr_row.split( "</td>" )
|
---|
| 79 | refseq = fields2[1].split( "</a>" )[0].split( ">" )[-1]
|
---|
| 80 | #genbank = fields2[2].split( "</a>" )[0].split( ">" )[-1]
|
---|
| 81 | chroms.append( refseq )
|
---|
| 82 |
|
---|
| 83 | return chroms
|
---|
| 84 |
|
---|
| 85 | def get_ftp_contents( ftp_url ):
|
---|
| 86 | ftp_count = 0
|
---|
| 87 | ftp_contents = None
|
---|
| 88 | while ftp_count < 500 and ftp_contents == None:
|
---|
| 89 | ftp_count += 1
|
---|
| 90 | try:
|
---|
| 91 | ftp = FTP( ftp_url.split("/")[2] )
|
---|
| 92 | ftp.login()
|
---|
| 93 | ftp.cwd( ftp_url.split( ftp_url.split( "/" )[2] )[-1] )
|
---|
| 94 | ftp_contents = ftp.nlst()
|
---|
| 95 | ftp.close()
|
---|
| 96 | except:
|
---|
| 97 | ftp_contents = None
|
---|
| 98 | time.sleep( 1 ) #Throttle Connection
|
---|
| 99 | return ftp_contents
|
---|
| 100 |
|
---|
| 101 | def scrape_ftp( ftp_contents, org_dir, org_num, refseq, ftp_url ):
|
---|
| 102 | for file_type, items in desired_ftp_files.items():
|
---|
| 103 | ext = items['ext']
|
---|
| 104 | ftp_filename = "%s.%s" % ( refseq, ext )
|
---|
| 105 | target_filename = os.path.join( org_dir, "%s.%s" % ( refseq, ext ) )
|
---|
| 106 | if ftp_filename in ftp_contents:
|
---|
| 107 | url_count = 0
|
---|
| 108 | url = "%s/%s" % ( ftp_url, ftp_filename )
|
---|
| 109 | results = None
|
---|
| 110 | while url_count < 500 and results is None:
|
---|
| 111 | url_count += 1
|
---|
| 112 | try:
|
---|
| 113 | results = urlretrieve( url, target_filename )
|
---|
| 114 | except:
|
---|
| 115 | results = None
|
---|
| 116 | time.sleep(1) #Throttle Connection
|
---|
| 117 | if results is None:
|
---|
| 118 | "URL COMPLETELY FAILED TO LOAD:", url
|
---|
| 119 | return
|
---|
| 120 |
|
---|
| 121 | #do special processing for each file type:
|
---|
| 122 | if items['parser'] is not None:
|
---|
| 123 | parser_results = globals()[items['parser']]( target_filename, org_num, refseq )
|
---|
| 124 | else:
|
---|
| 125 | print "FTP filetype:", file_type, "not found for", org_num, refseq
|
---|
| 126 | #FTP Files have been Loaded
|
---|
| 127 |
|
---|
| 128 |
|
---|
| 129 | def process_FASTA( filename, org_num, refseq ):
|
---|
| 130 | fasta = []
|
---|
| 131 | fasta = [line.strip() for line in open( filename, 'rb' ).readlines()]
|
---|
| 132 | fasta_header = fasta.pop( 0 )[1:]
|
---|
| 133 | fasta_header_split = fasta_header.split( "|" )
|
---|
| 134 | chr_name = fasta_header_split.pop( -1 ).strip()
|
---|
| 135 | accesions = {fasta_header_split[0]:fasta_header_split[1], fasta_header_split[2]:fasta_header_split[3]}
|
---|
| 136 | fasta = "".join( fasta )
|
---|
| 137 |
|
---|
| 138 | #Create Chrom Info File:
|
---|
| 139 | chrom_info_file = open( os.path.join( os.path.split( filename )[0], "%s.info" % refseq ), 'wb+' )
|
---|
| 140 | chrom_info_file.write( "chromosome=%s\nname=%s\nlength=%s\norganism=%s\n" % ( refseq, chr_name, len( fasta ), org_num ) )
|
---|
| 141 | try:
|
---|
| 142 | chrom_info_file.write( "gi=%s\n" % accesions['gi'] )
|
---|
| 143 | except:
|
---|
| 144 | chrom_info_file.write( "gi=None\n" )
|
---|
| 145 | try:
|
---|
| 146 | chrom_info_file.write( "gb=%s\n" % accesions['gb'] )
|
---|
| 147 | except:
|
---|
| 148 | chrom_info_file.write( "gb=None\n" )
|
---|
| 149 | try:
|
---|
| 150 | chrom_info_file.write( "refseq=%s\n" % refseq )
|
---|
| 151 | except:
|
---|
| 152 | chrom_info_file.write( "refseq=None\n" )
|
---|
| 153 | chrom_info_file.close()
|
---|
| 154 |
|
---|
| 155 | def process_Genbank( filename, org_num, refseq ):
|
---|
| 156 | #extracts 'CDS', 'tRNA', 'rRNA' features from genbank file
|
---|
| 157 | features = get_bed_from_genbank( filename, refseq, ['CDS', 'tRNA', 'rRNA'] )
|
---|
| 158 | for feature in features.keys():
|
---|
| 159 | feature_file = open( os.path.join( os.path.split( filename )[0], "%s.%s.bed" % ( refseq, feature ) ), 'wb+' )
|
---|
| 160 | feature_file.write( '\n'.join( features[feature] ) )
|
---|
| 161 | feature_file.close()
|
---|
| 162 | print "Genbank extraction finished for chrom:", refseq, "file:", filename
|
---|
| 163 |
|
---|
| 164 | def process_Glimmer3( filename, org_num, refseq ):
|
---|
| 165 | try:
|
---|
| 166 | glimmer3_bed = get_bed_from_glimmer3( filename, refseq )
|
---|
| 167 | except Exception, e:
|
---|
| 168 | print "Converting Glimmer3 to bed FAILED! For chrom:", refseq, "file:", filename, e
|
---|
| 169 | glimmer3_bed = []
|
---|
| 170 | glimmer3_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.Glimmer3.bed" % refseq ), 'wb+' )
|
---|
| 171 | glimmer3_bed_file.write( '\n'.join( glimmer3_bed ) )
|
---|
| 172 | glimmer3_bed_file.close()
|
---|
| 173 |
|
---|
| 174 | def process_GeneMarkHMM( filename, org_num, refseq ):
|
---|
| 175 | try:
|
---|
| 176 | geneMarkHMM_bed = get_bed_from_GeneMarkHMM( filename, refseq )
|
---|
| 177 | except Exception, e:
|
---|
| 178 | print "Converting GeneMarkHMM to bed FAILED! For chrom:", refseq, "file:", filename, e
|
---|
| 179 | geneMarkHMM_bed = []
|
---|
| 180 | geneMarkHMM_bed_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.GeneMarkHMM.bed" % refseq ), 'wb+' )
|
---|
| 181 | geneMarkHMM_bed_bed_file.write( '\n'.join( geneMarkHMM_bed ) )
|
---|
| 182 | geneMarkHMM_bed_bed_file.close()
|
---|
| 183 |
|
---|
| 184 | def process_GeneMark( filename, org_num, refseq ):
|
---|
| 185 | try:
|
---|
| 186 | geneMark_bed = get_bed_from_GeneMark( filename, refseq )
|
---|
| 187 | except Exception, e:
|
---|
| 188 | print "Converting GeneMark to bed FAILED! For chrom:", refseq, "file:", filename, e
|
---|
| 189 | geneMark_bed = []
|
---|
| 190 | geneMark_bed_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.GeneMark.bed" % refseq ), 'wb+' )
|
---|
| 191 | geneMark_bed_bed_file.write( '\n'.join( geneMark_bed ) )
|
---|
| 192 | geneMark_bed_bed_file.close()
|
---|
| 193 |
|
---|
| 194 |
|
---|
| 195 |
|
---|
| 196 | def __main__():
|
---|
| 197 | start_time = time.time()
|
---|
| 198 | base_dir = os.path.join( os.getcwd(), "bacteria" )
|
---|
| 199 | try:
|
---|
| 200 | base_dir = sys.argv[1]
|
---|
| 201 | except:
|
---|
| 202 | print "using default base_dir:", base_dir
|
---|
| 203 |
|
---|
| 204 | try:
|
---|
| 205 | os.mkdir( base_dir )
|
---|
| 206 | print "path '%s' has been created" % base_dir
|
---|
| 207 | except:
|
---|
| 208 | print "path '%s' seems to already exist" % base_dir
|
---|
| 209 |
|
---|
| 210 | for org_num, name, chroms, kingdom, group, org_genbank, org_refseq, info_url, ftp_url in iter_genome_projects():
|
---|
| 211 | if chroms is None:
|
---|
| 212 | continue #No chrom information, we can't really do anything with this organism
|
---|
| 213 | #Create org directory, if exists, assume it is done and complete --> skip it
|
---|
| 214 | try:
|
---|
| 215 | org_dir = os.path.join( base_dir, org_num )
|
---|
| 216 | os.mkdir( org_dir )
|
---|
| 217 | except:
|
---|
| 218 | print "Organism %s already exists on disk, skipping" % org_num
|
---|
| 219 | continue
|
---|
| 220 |
|
---|
| 221 | #get ftp contents
|
---|
| 222 | ftp_contents = get_ftp_contents( ftp_url )
|
---|
| 223 | if ftp_contents is None:
|
---|
| 224 | "FTP COMPLETELY FAILED TO LOAD", "org:", org_num, "ftp:", ftp_url
|
---|
| 225 | else:
|
---|
| 226 | for refseq in chroms:
|
---|
| 227 | ftp_result = scrape_ftp( ftp_contents, org_dir, org_num, refseq, ftp_url )
|
---|
| 228 | #FTP Files have been Loaded
|
---|
| 229 | print "Org:", org_num, "chrom:", refseq, "[", time.time() - start_time, "seconds elapsed. ]"
|
---|
| 230 |
|
---|
| 231 | #Create org info file
|
---|
| 232 | info_file = open( os.path.join( org_dir, "%s.info" % org_num ), 'wb+' )
|
---|
| 233 | info_file.write("genome project id=%s\n" % org_num )
|
---|
| 234 | info_file.write("name=%s\n" % name )
|
---|
| 235 | info_file.write("kingdom=%s\n" % kingdom )
|
---|
| 236 | info_file.write("group=%s\n" % group )
|
---|
| 237 | info_file.write("chromosomes=%s\n" % ",".join( chroms ) )
|
---|
| 238 | info_file.write("info url=%s\n" % info_url )
|
---|
| 239 | info_file.write("ftp url=%s\n" % ftp_url )
|
---|
| 240 | info_file.close()
|
---|
| 241 |
|
---|
| 242 | print "Finished Harvesting", "[", time.time() - start_time, "seconds elapsed. ]"
|
---|
| 243 | print "[", ( time.time() - start_time )/60, "minutes. ]"
|
---|
| 244 | print "[", ( time.time() - start_time )/60/60, "hours. ]"
|
---|
| 245 |
|
---|
| 246 | if __name__ == "__main__": __main__()
|
---|