Xiaoliang Jiang, Yingjun Guan, Xinyu Zhang, Jialu Wang.
You should write a function. This function will receive a pandas DataFrame object and a filename. The DataFrame columns are, in order: 1. names 2. dates 3. latitude 4. longitude 5. categorical 6. quant1 7. quant2 8. quant3.
The function should save a PNG file to the supplied filename. The dates column will be in seconds since the epoch.
For the first assumption, since the column structure is given as above, we use index numbers to locate the columns. For the second one, since the latitudes and longitudes are worldwide, we use a rotatable globe rather than a flat map to show the distribution. For the last assumption, we concentrated on one quantitative value together with several attributes (longitude, latitude and category), rather than on the relationships between quantitative values. We also did not compare the quantitative values directly in one plot, since if they are in different orders of magnitude the comparison would not make sense. In addition, although some quantitative values may be correlated, in general they are more likely to be independent of each other. So what we built concentrates on a single quantitative value at a time, not on all of them.
Strength:
Beyond the required functionality, our function has several strengths.
Weakness:
Wish to do:
Reference: https://plot.ly/python/mixed-subplots/
In [228]:
# By Xiaoliang Jiang
import plotly
import plotly.plotly as py
from plotly.graph_objs import *
import pandas as pd
import math
from IPython.display import Image
import time
# Register the Plotly account used by the plotly.plotly calls in this notebook.
# NOTE(review): the username and API key are hard-coded in source, so anyone who
# can read this file can use the account. Prefer loading them from the
# environment or a local, untracked credentials file, and rotate this key.
plotly.tools.set_credentials_file(username='xjiang36', api_key='uZyWsdSH3xd9bxUefIFf')
In [229]:
#By Xiaoliang Jiang
# Colours for the (at most ten) most frequent categories, most frequent first.
_WHALE_TOP_COLORS = ["#FF6666", "#FFB266", "#FFFF66", "#66FF66", "#66FFFF",
                     "#66B2FF", "#6666FF", "#B266FF", "#FF66FF", "#FF66B2"]
# Colour shared by every category outside the top N (and by missing categories).
_WHALE_OTHER_COLOR = "#C0C0C0"
_WHALE_SAVE_TYPES = ("inchunk", "online", "png", "jpeg")


def _whale_is_int(value):
    """Return True for a real int; bool is rejected like the original type() test."""
    return isinstance(value, int) and not isinstance(value, bool)


def _whale_check_dates(startyear, startmth, endyear, endmth):
    """Return an error string for an invalid year/month window, else None."""
    # Types are checked first: comparing e.g. a str with an int raises TypeError
    # on Python 3 before any error message could be returned.
    for label, value in (("startyear", startyear), ("endyear", endyear),
                         ("startmth", startmth), ("endmth", endmth)):
        if not _whale_is_int(value):
            return "Error: wrong input of \"%s\"." % label
    if startyear > endyear:
        return ("Error: wrong input of \"startyear\" or \"endyear\","
                "\"startyear\" cannot be later than \"endyear\".")
    if not 1 <= startmth <= 12:
        return "Error: wrong input of \"startmth\"."
    if not 1 <= endmth <= 12:
        return "Error: wrong input of \"endmth\"."
    if startyear == endyear and startmth > endmth:
        return "Error: wrong input of \"startmth\" or \"endmth\"."
    return None


def _whale_select_window(frame, date_col, startyear, startmth, endyear, endmth):
    """Keep the rows whose UTC year/month lies inside the inclusive window.

    Returns the filtered frame (fresh 0..n-1 index) and the
    (startyear, startmth, endyear, endmth) tuple to show in the title.
    """
    stamps = pd.to_datetime(frame[date_col], unit='s')
    # One sortable number per calendar month, so that the window is compared as
    # a whole (year, month) pair instead of year and month independently.
    # float64 keeps rows with a missing date as NaN, which never matches.
    month_keys = stamps.dt.year.astype('float64') * 12 + (stamps.dt.month - 1)
    first_key = startyear * 12 + (startmth - 1)
    last_key = endyear * 12 + (endmth - 1)
    selected = frame[(month_keys >= first_key) & (month_keys <= last_key)]

    # The title should not advertise years the data does not contain (the
    # defaults are -99999/99999). A clamped bound covers that whole year.
    known = stamps.dropna()
    if len(known):
        first_year = int(known.min().year)
        last_year = int(known.max().year)
        if startyear < first_year:
            startyear, startmth = first_year, 1
        if endyear > last_year:
            endyear, endmth = last_year, 12

    selected = selected.dropna(how='all').reset_index(drop=True)
    return selected, (startyear, startmth, endyear, endmth)


def _whale_quartile_cuts(values):
    """Return (lower, median, upper): the median and the medians of each half."""
    median = values.median()
    return values[values < median].median(), median, values[values > median].median()


def _whale_marker_sizes(values, cuts):
    """Map every value to a marker size (3, 6, 9 or 15) by quartile band."""
    lower, median, upper = cuts
    sizes = []
    for value in values:
        # Comparisons with NaN are False, so missing values get the smallest size.
        if value > median:
            sizes.append(15 if value > upper else 9)
        else:
            sizes.append(6 if value > lower else 3)
    return sizes


def _whale_range_line(template, low, high):
    """Format one 'x% to y%: low to high' legend line."""
    return template.ljust(30) % (round(low, 4), round(high, 4))


def Whaleteam(dataset='Q3-moreCtgSample.csv', topn=5, quantnum=1, savetype="inchunk",
              startyear=-99999, startmth=1, endyear=99999, endmth=12, filename=None):
    """Plot one quantitative column on a globe next to a category bar chart.

    The data must have the columns, in this order: names, dates (seconds since
    the epoch), latitude, longitude, categorical, then one or more quantitative
    columns. Rows are restricted to the inclusive window
    ``startyear/startmth`` .. ``endyear/endmth`` (UTC). Markers are coloured by
    category (the ``topn`` most frequent get their own colour, the rest grey)
    and sized by the quartile band of the chosen quantitative column.

    Args:
        dataset: path of a CSV file (read as iso-8859-1) or a pandas DataFrame.
        topn: number of most frequent categories to colour individually (1-10).
        quantnum: 1-based number of the quantitative column to display.
        savetype: "inchunk" (show inline image), "online" (plotly iplot),
            "png" or "jpeg" (save an image file).
        startyear, startmth, endyear, endmth: inclusive date window, as ints.
        filename: target path for "png"/"jpeg"; defaults to 'Whale-plot.png' /
            'Whale-plot.jpeg'. The extension should match ``savetype``.

    Returns:
        None on success, or a string starting with "Error:" describing the
        first invalid argument.
    """
    if savetype not in _WHALE_SAVE_TYPES:
        return "Error: invalid input of \"savetype\""

    date_error = _whale_check_dates(startyear, startmth, endyear, endmth)
    if date_error is not None:
        return date_error

    if isinstance(dataset, pd.DataFrame):
        # Work on a re-indexed copy so the caller's frame is never modified.
        raw = dataset.reset_index(drop=True)
    else:
        raw = pd.read_csv(dataset, encoding='iso-8859-1')
    colnames = raw.columns.values

    if not _whale_is_int(quantnum) or quantnum < 1:
        return "Error: wrong input of \"quantnum\", which should be an possitive integer."
    if (quantnum + 5) > len(colnames):
        return "Error: wrong input of \"quantnum\". No that many quantitive variable."
    name_col, date_col, lat_col, lon_col, cat_col = colnames[:5]
    quant_col = colnames[quantnum + 4]

    df, (startyear, startmth, endyear, endmth) = _whale_select_window(
        raw, date_col, startyear, startmth, endyear, endmth)

    # Categories ordered from most to least frequent (missing values excluded).
    counts = df[cat_col].value_counts()
    if not _whale_is_int(topn) or topn < 1:
        return "Error: wrong input of \"topn\", which should be an possitive integer."
    if len(counts) < topn:
        print("freq=%s" % counts)
        print("topn=%s" % topn)
        return "Error: wrong input of \"topn\", No that many categories."
    if topn > len(_WHALE_TOP_COLORS):
        return "Error: wrong input of \"topn\", at most %s categories can be coloured." % len(_WHALE_TOP_COLORS)

    # One colour per row, so the list always lines up with lat/lon; categories
    # outside the top N and missing categories share the grey "other" colour.
    top_color = dict(zip(counts.index[:topn], _WHALE_TOP_COLORS))
    marker_colors = [top_color.get(category, _WHALE_OTHER_COLOR) for category in df[cat_col]]

    values = df[quant_col]
    cuts = _whale_quartile_cuts(values)
    lower, median, upper = cuts
    marker_sizes = _whale_marker_sizes(values, cuts)

    # Bar chart: the top N categories plus one bar summing everything else.
    bar_labels = list(counts.index[:topn]) + ["Other"]
    bar_counts = [int(c) for c in counts.iloc[:topn]] + [int(counts.iloc[topn:].sum())]
    bar_colors = _WHALE_TOP_COLORS[:topn] + [_WHALE_OTHER_COLOR]
    locations = Bar(x=bar_labels, y=bar_counts, marker=dict(color=bar_colors))

    trace3 = {
        "geo": "geo3",
        "lon": df[lon_col],
        "lat": df[lat_col],
        "hoverinfo": 'text',
        "marker": {
            "size": marker_sizes,
            "opacity": 0.8,
            "color": marker_colors,
            "colorscale": 'Viridis'
        },
        "mode": "markers",
        "type": "scattergeo"
    }
    data = Data([locations, trace3])

    # control the subplot below using domain in 'geo', 'scene', and 'axis'
    layout = {
        "plot_bgcolor": 'black',
        "paper_bgcolor": 'black',
        "titlefont": {
            "size": 20,
            "family": "Raleway"
        },
        "font": {
            "color": 'white'
        },
        "dragmode": "zoom",
        "geo3": {
            "domain": {
                "x": [0, 0.55],
                "y": [0.18, 0.9]
            },
            "lakecolor": "rgba(127,205,255,1)",
            "oceancolor": "rgb(6,66,115)",
            "landcolor": 'white',
            "projection": {"type": "orthographic"},
            "scope": "world",
            "showlakes": True,
            "showocean": True,
            "showland": True,
            "bgcolor": 'black'
        },
        "margin": {
            "r": 10,
            "t": 25,
            "b": 40,
            "l": 60
        },
        "scene": {
            "domain": {
                "x": [0.5, 1],
                "y": [0, 0.55]
            },
            "xaxis": {"gridcolor": 'white'},
            "yaxis": {"gridcolor": 'white'},
            "zaxis": {"gridcolor": 'white'}
        },
        "showlegend": False,
        "title": "<br>Overview of \"%s\" (top%s) from %s/%s to %s/%s" % (
            quant_col, topn, startyear, startmth, endyear, endmth),
        "xaxis": {
            "anchor": "y",
            "domain": [0.6, 0.95]
        },
        "yaxis": {
            "anchor": "x",
            "domain": [0.1, 0.9],
            "showgrid": False
        }
    }

    bar_caption = {
        "text": "Histrogram of \"%s\" (top%s)" % (quant_col, topn),
        "showarrow": False,
        "xref": "paper",
        "yref": "paper",
        "x": 0.85,
        "y": 0.95}

    # Legend for the four marker sizes: the value range of each quartile band.
    size_legend = {
        "text": "<br>".join([
            _whale_range_line("0%% to 25%%:%s to %s", values.min(), lower),
            _whale_range_line("25%% to 50%%:%s to %s", lower, median),
            _whale_range_line("50%% to 75%%:%s to %s", median, upper),
            _whale_range_line("75%% to 100%%:%s to %s", upper, values.max()),
        ]),
        "showarrow": False,
        "xref": "paper",
        "yref": "paper",
        "x": 0,
        "y": 0}

    # Names of the three largest values inside the selected window; padded with
    # blanks when the window holds fewer than three rows.
    top_names = list(df.sort_values(quant_col, ascending=False)[name_col].iloc[:3])
    top_names += [""] * (3 - len(top_names))
    top3_caption = {
        "text": "Top3 position:<br>%s <br>%s <br>%s".ljust(15) % tuple(top_names),
        "showarrow": False,
        "xref": "paper",
        "yref": "paper",
        "x": 0.35,
        "y": 0}

    layout['annotations'] = [bar_caption, size_legend, top3_caption]
    fig = Figure(data=data, layout=layout)

    if savetype == "inchunk":
        py.image.ishow(fig)
    elif savetype == "online":
        py.iplot(fig, filename="Whale-plot-finalversion")
    elif savetype == "png":
        py.image.save_as(fig, filename if filename is not None else 'Whale-plot.png')
    else:  # "jpeg" -- savetype was validated on entry
        py.image.save_as(fig, filename if filename is not None else 'Whale-plot.jpeg')
In [221]:
# Show the second quantitative column for June-August 2016 as an inline image,
# with the ten most frequent categories coloured individually.
Whaleteam(
    dataset='Q3-moreCtgSample.csv',
    topn=10,
    quantnum=2,
    savetype="inchunk",
    startyear=2016,
    startmth=6,
    endyear=2016,
    endmth=8,
)
In [230]:
# Same selection as the inline example, but sent through plotly's iplot
# (savetype="online") instead of being rendered as a static image.
Whaleteam(
    dataset='Q3-moreCtgSample.csv',
    topn=10,
    quantnum=2,
    savetype="online",
    startyear=2016,
    startmth=6,
    endyear=2016,
    endmth=8,
)
In [223]:
# Run against the 't3sample.csv' data set with the five most frequent
# categories coloured, again for June-August 2016 and shown inline.
Whaleteam(
    dataset='t3sample.csv',
    topn=5,
    quantnum=2,
    savetype="inchunk",
    startyear=2016,
    startmth=6,
    endyear=2016,
    endmth=8,
)
In [224]:
# Run with every default: 'Q3-moreCtgSample.csv', top 5 categories, first
# quantitative column, inline image ("inchunk"), and the widest date window.
Whaleteam()
In [ ]: