I will build the URL from its logical blocks:
url_base='http://www.ebi.ac.uk/ena/data/warehouse/search?' #base for advanced search
url_query='query=\"tax_tree(11320)\"' #influenza A taxon and all subordinates (tree)
url_result='&result=sample' # looking for samples, they have location
url_count='&resultcount' # count the results
url=url_base+url_query+url_result+url_count #concatenate
print 'The url is:',url #print
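The quotes and parentheses go into the URL unescaped here, which worked for this query; if they ever cause trouble, the query value can be percent-encoded first. A minimal sketch using urllib.quote:
import urllib #needed here if the cell below has not run yet
url_query_encoded='query='+urllib.quote('"tax_tree(11320)"') #percent-encodes the quotes and parentheses
print 'Encoded url is:',url_base+url_query_encoded+url_result+url_count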
Query the URL and read the result back as a string
import urllib #python module for opening URLs
url_res = urllib.urlopen(url).read()
print url_res
n_samples=int(''.join(url_res.split('\n')[0].split()[-1].split(',')))
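The line above leans on the exact layout of the count response (last token of the first line, with thousands separators); a slightly more defensive sketch strips every non-digit character from that token instead:
import re
count_token = url_res.split('\n')[0].split()[-1]
n_samples = int(re.sub(r'[^0-9]', '', count_token)) #drop commas or any other separator
print 'Number of samples:', n_samples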
Build the URL again, this time asking for the full report
url_base='http://www.ebi.ac.uk/ena/data/warehouse/search?'
url_query='query=\"tax_tree(11320)\"'
url_result='&result=sample'
url_display='&display=report' #report is the tab separated output
url_fields='&fields=accession,country,collection_date,host,location' #get accession, location, date, host and country
url_limits='&offset=1&length='+str(n_samples) #get all the results
url=url_base+url_query+url_result+url_display+url_fields+url_limits
print 'The url is:',url
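The same parameters could also be assembled with urllib.urlencode, which does the escaping and the '&' bookkeeping for us; a sketch (whether the ENA endpoint is happy with the percent-encoded quotes and commas is worth checking before relying on it):
params = [('query', '"tax_tree(11320)"'),
          ('result', 'sample'),
          ('display', 'report'),
          ('fields', 'accession,country,collection_date,host,location'),
          ('offset', 1),
          ('length', n_samples)]
print 'The url is:', url_base + urllib.urlencode(params)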
The result is a tab-separated table
ena_flu_loco_page = urllib.urlopen(url).read()
Load the table into a pandas DataFrame
import pandas as pd #pandas
from StringIO import StringIO #for reading string into pandas
ena_flu_loco_table = pd.read_csv(StringIO(ena_flu_loco_page),sep='\t')
Peek into the table
ena_flu_loco_table.head()
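pandas can also read straight from the URL, so the intermediate string is not strictly needed; a sketch of that shortcut:
ena_flu_loco_table = pd.read_csv(url, sep='\t') #pandas downloads the report itself
ena_flu_loco_table.head()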
print "The number of sample with geolocations,and date is: ",
print len(ena_flu_loco_table[
(pd.isnull(ena_flu_loco_table['location']) == False) &
(pd.isnull(ena_flu_loco_table['collection_date']) == False) ])
Get rid of samples with no geolocation or collection date
ena_flu_loco_table=ena_flu_loco_table[
    (pd.isnull(ena_flu_loco_table['location']) == False) &
    (pd.isnull(ena_flu_loco_table['collection_date']) == False) ]
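The same filter can be written more compactly with dropna; this sketch should be equivalent:
ena_flu_loco_table = ena_flu_loco_table.dropna(subset=['location','collection_date'])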
def parse_lat(string_loc):
    #ENA location strings look like '12.34 N 56.78 E': latitude, N/S, longitude, E/W
    loc_list=string_loc.split(' ')
    if (loc_list[1] =='N'): #northern hemisphere: positive latitude
        return float(loc_list[0])
    elif (loc_list[1] =='S'): #southern hemisphere: negative latitude
        return -float(loc_list[0])

def parse_lon(string_loc):
    loc_list=string_loc.split(' ')
    if (loc_list[3] =='E'): #eastern hemisphere: positive longitude
        return float(loc_list[2])
    elif (loc_list[3] =='W'): #western hemisphere: negative longitude
        return -float(loc_list[2])
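A quick sanity check of the parsers on a made-up location string (the coordinates are hypothetical, only the format matters):
example_loc = '47.50 N 19.04 E' #hypothetical string in the 'lat N/S lon E/W' format
print parse_lat(example_loc), parse_lon(example_loc) #expect 47.5 and 19.04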
ena_flu_loco_table['lat']=map(parse_lat,ena_flu_loco_table['location']) #numeric latitude
ena_flu_loco_table['lon']=map(parse_lon,ena_flu_loco_table['location']) #numeric longitude
ena_flu_loco_table['date']=[x.split('/')[0] for x in ena_flu_loco_table['collection_date']] #keep only the part before any '/'
ena_flu_loco_table=ena_flu_loco_table[['lat','lon','accession','country',
                                       'date','host']] #keep only the columns needed downstream
Peek into the table
ena_flu_loco_table.head()
import pandas as pd
cge_table=pd.DataFrame(columns=['city','google_location','source_note','strain','collection_date',
                                'country','region','collected_by','longitude','isolation_source',
                                'pathogenic','latitude','location_note','pathogenicity_note',
                                'organism','notes','zip_code'])
cge_table['latitude']=ena_flu_loco_table['lat']
cge_table['longitude']=ena_flu_loco_table['lon']
cge_table['country']=ena_flu_loco_table['country']
cge_table['collection_date']=ena_flu_loco_table['date']
cge_table['isolation_source']=ena_flu_loco_table['host']
Write it out as JSON to the location where the map will read it
cge_table.to_json('json/influenza_data.js',orient='records')
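To check what was written, the file can be read straight back with pandas; a sketch:
check = pd.read_json('json/influenza_data.js', orient='records')
print len(check), 'records written for the map'
check.head()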
from IPython.display import HTML
HTML('''
<div class="wrap">
<iframe class="frame" src="index.html"></iframe>
</div>
<style>
.wrap {
width: 945px;  /* 0.9 x the 1050px iframe width */
height: 702px; /* 0.9 x the 780px iframe height */
padding: 0;
overflow: hidden;
}
.frame {
width: 1050px;
height: 780px;
border: 0;
-ms-transform: scale(0.9);
-moz-transform: scale(0.9);
-o-transform: scale(0.9);
-webkit-transform: scale(0.9);
transform: scale(0.9);
-ms-transform-origin: 0 0;
-moz-transform-origin: 0 0;
-o-transform-origin: 0 0;
-webkit-transform-origin: 0 0;
transform-origin: 0 0;
}
</style>
''')