I will build the URL from its logical blocks:
# Build the ENA advanced-search URL from its logical parts.
url_base = 'http://www.ebi.ac.uk/ena/data/warehouse/search?'  # base for advanced search
url_query = 'query="geo_box1(30,-30,72,58)"'  # bounding box roughly covering Europe
url_result = '&result=sample'  # samples are the records that carry a location
url_count = '&resultcount'  # ask only for the number of matching results
url = url_base + url_query + url_result + url_count
print('The url is:', url)
Query the URL and read the result back as a string.
# Query the count URL and parse the number of matching samples.
import urllib.request  # Python 3 home of urlopen (urllib.urlopen was removed)

# urlopen returns bytes in Python 3; decode before string processing.
res = urllib.request.urlopen(url).read().decode('utf-8')
print(res)
# The first line ends with the count, possibly with thousands separators
# (e.g. "1,234"); strip the commas before converting to int.
n_sample = int(res.split('\n')[0].split(' ')[-1].replace(',', ''))
print("Number of samples: ", n_sample)
Build url again
# Rebuild the search URL, this time requesting a tab-separated report of
# accession + location for every one of the n_sample hits.
url_base = 'http://www.ebi.ac.uk/ena/data/warehouse/search?'
url_query = 'query="geo_box1(30,-30,72,58)"'
url_result = '&result=sample'
url_display = '&display=report'  # report = tab-separated output
url_fields = '&fields=accession,location'  # columns: accession and location
url_limits = '&offset=1&length=' + str(n_sample)  # fetch all the results
url = url_base + url_query + url_result + url_display + url_fields + url_limits
print('The url is:', url)
The result is a tab-separated table, which I will download into a string.
# Download the tab-separated report as one decoded string.
import urllib.request  # Python 3 replacement for py2 urllib.urlopen

ena_flu_loco_page = urllib.request.urlopen(url).read().decode('utf-8')
Load the table into a pandas DataFrame
# Load the tab-separated report into a pandas DataFrame.
import pandas as pd
from io import StringIO  # the py2 StringIO module moved into io in Python 3

ena_flu_loco_table = pd.read_csv(StringIO(ena_flu_loco_page), sep='\t')
Peek into the table
ena_flu_loco_table.head()  # quick sanity check of the parsed table
def parse_lat(string_loc):
    """Parse the latitude from an ENA location string like '47.25 N 5.60 E'.

    Returns the latitude as a signed float (southern hemisphere negative),
    or None when the hemisphere letter is neither 'N' nor 'S'.
    """
    loc_list = string_loc.split()  # split() tolerates repeated whitespace
    if loc_list[1] == 'N':
        return float(loc_list[0])
    elif loc_list[1] == 'S':
        return -float(loc_list[0])
    return None  # explicit: unrecognized hemisphere marker
def parse_lon(string_loc):
    """Parse the longitude from an ENA location string like '47.25 N 5.60 E'.

    Returns the longitude as a signed float (western hemisphere negative),
    or None when the hemisphere letter is neither 'E' nor 'W'.
    """
    loc_list = string_loc.split()  # split() tolerates repeated whitespace
    if loc_list[3] == 'E':
        return float(loc_list[2])
    elif loc_list[3] == 'W':
        return -float(loc_list[2])
    return None  # explicit: unrecognized hemisphere marker
# Turn the textual locations into numeric lat/lon columns.
# Series.map works on every Python version; the builtin map() returns an
# iterator in Python 3, which cannot be assigned as a DataFrame column.
ena_flu_loco_table['lat'] = ena_flu_loco_table['location'].map(parse_lat)
ena_flu_loco_table['lon'] = ena_flu_loco_table['location'].map(parse_lon)
# Keep only the columns needed for plotting.
ena_flu_loco_table = ena_flu_loco_table[['lat', 'lon', 'accession']]
ena_flu_loco_table.head()
print('Number of unique locations:',
      len(ena_flu_loco_table.groupby(['lat', 'lon']).size().reset_index()))
Contents:
I am using the SQL-like groupby statement for grouping the samples.
# The function used for grouping.
def form_acc(x):
    """Summarize one (lat, lon) group of samples.

    Returns a Series with the sample count and an accession list string.
    Small groups (< 5 samples) keep the full list; larger groups are
    abbreviated to the first and last two accessions.
    """
    accessions = list(x['accession'])
    count = len(accessions)
    if count < 5:
        acc_list = ' '.join(accessions)
    else:
        acc_list = ' '.join(accessions[:2]) + ' ... ' + ' '.join(accessions[-2:])
    # dict literal directly — the original dict(dict(...)) wrapper was redundant
    return pd.Series({'count': count, 'acc_list': acc_list})
#group-by
# Collapse samples sharing a coordinate into one row per (lat, lon),
# carrying a count and an abbreviated accession list (built by form_acc).
uniq_locs_w_acc=ena_flu_loco_table.groupby(['lat','lon']).apply(form_acc).reset_index()
I will use the Folium library, which is a Python wrapper for the Leaflet JavaScript library, for map-based visualizations.
First define the map drawing function
from IPython.core.display import HTML
import folium
def inline_map(m, width=650, height=500):
    """Render a folium map inline by embedding its HTML in an iframe."""
    # Old folium API: _build_map() materializes the page into m.HTML.
    m._build_map()
    # Escape double quotes so the document can live inside srcdoc="...".
    doc = m.HTML.replace('"', '&quot;')
    iframe = ('<iframe srcdoc="{}" '
              'style="width: {}px; height: {}px; '
              'border: none"></iframe>').format(doc, width, height)
    return HTML(iframe)
Initialize the map object
# Dimensions (pixels) of the embedded map.
width = 650
height = 500
# Base map centered in the North Atlantic, zoomed out to show Europe.
flu_map = folium.Map(
    location=[47, -17],
    zoom_start=3,
    tiles='OpenStreetMap',
    width=width,
    height=height,
)
Add point to the map object
# Add one circle marker per unique location; xrange is Python 2-only,
# and iterrows() is the idiomatic row iteration for a DataFrame.
for _, row in uniq_locs_w_acc.iterrows():
    loc = (row['lat'], row['lon'])
    name = 'Number of cases: ' + str(row['count'])
    name += ' Accessions: ' + row['acc_list']  # fixed popup typo ("Accesions")
    # Scale radius with sqrt(count) so the circle AREA tracks the count.
    size = row['count'] ** 0.5
    flu_map.circle_marker(location=loc, radius=1e3 * size,
                          line_color='none', fill_color='#3186cc',
                          fill_opacity=0.7, popup=name)
And finally draw the map
inline_map(flu_map)  # render the assembled map inline in the notebook
Memory footprint:
Map:
Some small details: