#!/usr/bin/env python
#Dan Blankenberg
#Harvest Bacteria
#Connects to NCBI's Microbial Genome Projects website and scrapes it for information.
#Downloads and converts annotations for each Genome

import sys, os, time
from urllib2 import urlopen
from urllib import urlretrieve
from ftplib import FTP
from BeautifulSoup import BeautifulSoup
from util import get_bed_from_genbank, get_bed_from_glimmer3, get_bed_from_GeneMarkHMM, get_bed_from_GeneMark

assert sys.version_info[:2] >= ( 2, 4 )

#this defines the types of ftp files we are interested in, and how to process/convert them to a form for our use;
#'parser' names a module-level function that is looked up via globals() in scrape_ftp()
desired_ftp_files = { 'GeneMark':    { 'ext': 'GeneMark-2.5f',    'parser': 'process_GeneMark' },
                      'GeneMarkHMM': { 'ext': 'GeneMarkHMM-2.6m', 'parser': 'process_GeneMarkHMM' },
                      'Glimmer3':    { 'ext': 'Glimmer3',         'parser': 'process_Glimmer3' },
                      'fna':         { 'ext': 'fna',              'parser': 'process_FASTA' },
                      'gbk':         { 'ext': 'gbk',              'parser': 'process_Genbank' } }

#yields: number, name, chroms, kingdom, group, genbank, refseq, info_url, ftp_url
def iter_genome_projects( url = "http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi?view=1", info_url_base = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=" ):
    for row in BeautifulSoup( urlopen( url ) ).findAll( name = 'tr', bgcolor = ["#EEFFDD", "#E8E8DD"] ):
        row = str( row ).replace( "\n", "" ).replace( "\r", "" )
        fields = row.split( "</td>" ) #split the table row into its cells
        org_num = fields[0].split( "list_uids=" )[-1].split( "\"" )[0]
        name = fields[1].split( "\">" )[-1].split( "<" )[0]
        kingdom = "archaea"
        if "B" in fields[2]: #the kingdom cell flags Bacteria with a "B"
            kingdom = "bacteria"
        group = fields[3].split( ">" )[-1]
        info_url = "%s%s" % ( info_url_base, org_num )
        org_genbank = fields[7].split( "\">" )[-1].split( "<" )[0].split( "." )[0]
        org_refseq = fields[8].split( "\">" )[-1].split( "<" )[0].split( "." )[0]
        #some entries do not have an ftp url; try/except it here:
        try:
            ftp_url = fields[22].split( "href=\"" )[1].split( "\"" )[0]
        except:
            print "FAILED TO ACQUIRE FTP ADDRESS:", org_num, info_url
            ftp_url = None
        chroms = get_chroms_by_project_id( org_num )
        yield org_num, name, chroms, kingdom, group, org_genbank, org_refseq, info_url, ftp_url

def get_chroms_by_project_id( org_num, base_url = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=" ):
    url = "%s%s" % ( base_url, org_num )
    html_count = 0
    html = None
    while html_count < 500 and html is None:
        html_count += 1
        try:
            html = urlopen( url )
        except:
            print "GENOME PROJECT FAILED:", html_count, "org:", org_num, url
            html = None
            time.sleep( 1 ) #Throttle Connection
    if html is None:
        print "GENOME PROJECT COMPLETELY FAILED TO LOAD", "org:", org_num, url
        return None
    chroms = []
    for chr_row in BeautifulSoup( html ).findAll( "tr", { "class": "vvv" } ):
        chr_row = str( chr_row ).replace( "\n", "" ).replace( "\r", "" )
        fields2 = chr_row.split( "</td>" )
        refseq = fields2[1].split( "</a>" )[0].split( ">" )[-1]
        #genbank = fields2[2].split( "</a>" )[0].split( ">" )[-1]
        chroms.append( refseq )
    return chroms
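
#A minimal usage sketch for the scraper above (the project id is hypothetical;
#the returned value is a list of RefSeq chromosome accessions, or None when
#the project page never loads):
#   chroms = get_chroms_by_project_id( "12345" ) #e.g. ['NC_000000']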

def get_ftp_contents( ftp_url ):
    ftp_count = 0
    ftp_contents = None
    while ftp_count < 500 and ftp_contents is None:
        ftp_count += 1
        try:
            ftp = FTP( ftp_url.split( "/" )[2] ) #connect to the host portion of the url
            ftp.login() #anonymous login
            ftp.cwd( ftp_url.split( ftp_url.split( "/" )[2] )[-1] ) #cwd to the path portion of the url
            ftp_contents = ftp.nlst()
            ftp.close()
        except:
            ftp_contents = None
            time.sleep( 1 ) #Throttle Connection
    return ftp_contents

def scrape_ftp( ftp_contents, org_dir, org_num, refseq, ftp_url ):
    for file_type, items in desired_ftp_files.items():
        ext = items['ext']
        ftp_filename = "%s.%s" % ( refseq, ext )
        target_filename = os.path.join( org_dir, "%s.%s" % ( refseq, ext ) )
        if ftp_filename in ftp_contents:
            url_count = 0
            url = "%s/%s" % ( ftp_url, ftp_filename )
            results = None
            while url_count < 500 and results is None:
                url_count += 1
                try:
                    results = urlretrieve( url, target_filename )
                except:
                    results = None
                    time.sleep( 1 ) #Throttle Connection
            if results is None:
                print "URL COMPLETELY FAILED TO LOAD:", url
                return
            #do special processing for each file type:
            if items['parser'] is not None:
                parser_results = globals()[items['parser']]( target_filename, org_num, refseq )
        else:
            print "FTP filetype:", file_type, "not found for", org_num, refseq
    #FTP Files have been Loaded

def process_FASTA( filename, org_num, refseq ):
    fasta = [ line.strip() for line in open( filename, 'rb' ).readlines() ]
    fasta_header = fasta.pop( 0 )[1:]
    fasta_header_split = fasta_header.split( "|" )
    chr_name = fasta_header_split.pop( -1 ).strip()
    accessions = { fasta_header_split[0]: fasta_header_split[1], fasta_header_split[2]: fasta_header_split[3] }
    fasta = "".join( fasta )
    #Create Chrom Info File:
    chrom_info_file = open( os.path.join( os.path.split( filename )[0], "%s.info" % refseq ), 'wb+' )
    chrom_info_file.write( "chromosome=%s\nname=%s\nlength=%s\norganism=%s\n" % ( refseq, chr_name, len( fasta ), org_num ) )
    try:
        chrom_info_file.write( "gi=%s\n" % accessions['gi'] )
    except:
        chrom_info_file.write( "gi=None\n" )
    try:
        chrom_info_file.write( "gb=%s\n" % accessions['gb'] )
    except:
        chrom_info_file.write( "gb=None\n" )
    try:
        chrom_info_file.write( "refseq=%s\n" % refseq )
    except:
        chrom_info_file.write( "refseq=None\n" )
    chrom_info_file.close()
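
#For illustration, the <refseq>.info file written above is plain key=value
#text; with hypothetical values it would read:
#   chromosome=NC_000000
#   name=Example organism chromosome
#   length=1234567
#   organism=12345
#   gi=123456789
#   gb=None
#   refseq=NC_000000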

def process_Genbank( filename, org_num, refseq ):
    #extracts 'CDS', 'tRNA', 'rRNA' features from the genbank file
    features = get_bed_from_genbank( filename, refseq, ['CDS', 'tRNA', 'rRNA'] )
    for feature in features.keys():
        feature_file = open( os.path.join( os.path.split( filename )[0], "%s.%s.bed" % ( refseq, feature ) ), 'wb+' )
        feature_file.write( '\n'.join( features[feature] ) )
        feature_file.close()
    print "Genbank extraction finished for chrom:", refseq, "file:", filename

def process_Glimmer3( filename, org_num, refseq ):
    try:
        glimmer3_bed = get_bed_from_glimmer3( filename, refseq )
    except Exception, e:
        print "Converting Glimmer3 to bed FAILED! For chrom:", refseq, "file:", filename, e
        glimmer3_bed = []
    glimmer3_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.Glimmer3.bed" % refseq ), 'wb+' )
    glimmer3_bed_file.write( '\n'.join( glimmer3_bed ) )
    glimmer3_bed_file.close()

def process_GeneMarkHMM( filename, org_num, refseq ):
    try:
        geneMarkHMM_bed = get_bed_from_GeneMarkHMM( filename, refseq )
    except Exception, e:
        print "Converting GeneMarkHMM to bed FAILED! For chrom:", refseq, "file:", filename, e
        geneMarkHMM_bed = []
    geneMarkHMM_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.GeneMarkHMM.bed" % refseq ), 'wb+' )
    geneMarkHMM_bed_file.write( '\n'.join( geneMarkHMM_bed ) )
    geneMarkHMM_bed_file.close()

def process_GeneMark( filename, org_num, refseq ):
    try:
        geneMark_bed = get_bed_from_GeneMark( filename, refseq )
    except Exception, e:
        print "Converting GeneMark to bed FAILED! For chrom:", refseq, "file:", filename, e
        geneMark_bed = []
    geneMark_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.GeneMark.bed" % refseq ), 'wb+' )
    geneMark_bed_file.write( '\n'.join( geneMark_bed ) )
    geneMark_bed_file.close()

def __main__():
    start_time = time.time()
    base_dir = os.path.join( os.getcwd(), "bacteria" )
    try:
        base_dir = sys.argv[1]
    except:
        print "using default base_dir:", base_dir
    try:
        os.mkdir( base_dir )
        print "path '%s' has been created" % base_dir
    except:
        print "path '%s' seems to already exist" % base_dir
    for org_num, name, chroms, kingdom, group, org_genbank, org_refseq, info_url, ftp_url in iter_genome_projects():
        if chroms is None:
            continue #No chrom information, we can't really do anything with this organism
        #Create org directory; if it already exists, assume it is done and complete --> skip it
        try:
            org_dir = os.path.join( base_dir, org_num )
            os.mkdir( org_dir )
        except:
            print "Organism %s already exists on disk, skipping" % org_num
            continue
        #get ftp contents
        ftp_contents = get_ftp_contents( ftp_url )
        if ftp_contents is None:
            print "FTP COMPLETELY FAILED TO LOAD", "org:", org_num, "ftp:", ftp_url
        else:
            for refseq in chroms:
                ftp_result = scrape_ftp( ftp_contents, org_dir, org_num, refseq, ftp_url )
                #FTP Files have been Loaded
                print "Org:", org_num, "chrom:", refseq, "[", time.time() - start_time, "seconds elapsed. ]"
        #Create org info file
        info_file = open( os.path.join( org_dir, "%s.info" % org_num ), 'wb+' )
        info_file.write( "genome project id=%s\n" % org_num )
        info_file.write( "name=%s\n" % name )
        info_file.write( "kingdom=%s\n" % kingdom )
        info_file.write( "group=%s\n" % group )
        info_file.write( "chromosomes=%s\n" % ",".join( chroms ) )
        info_file.write( "info url=%s\n" % info_url )
        info_file.write( "ftp url=%s\n" % ftp_url )
        info_file.close()
    print "Finished Harvesting", "[", time.time() - start_time, "seconds elapsed. ]"
    print "[", ( time.time() - start_time ) / 60, "minutes. ]"
    print "[", ( time.time() - start_time ) / 60 / 60, "hours. ]"

if __name__ == "__main__": __main__()
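
#Example invocation (a sketch; the script filename is assumed). Requires
#Python 2.4+, BeautifulSoup 3.x, and the local util module, with the NCBI
#website and FTP site reachable:
#   python harvest_bacteria.py /path/to/output
#Each organism lands in /path/to/output/<genome project id>/, holding an
#<org_num>.info file plus, per chromosome, <refseq>.info, <refseq>.fna,
#<refseq>.gbk and the converted *.bed annotation tracks.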