In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the raw table from data.json.
df_full = pd.read_json('data.json', orient='index')
# pandas has matching readers for other formats too (CSV, Excel sheets, etc.).
# info() prints its summary itself and returns None, so it is called directly
# instead of being wrapped in a print (which would also print a stray "None").
df_full.info()
In [3]:
# Summary statistics of the full table.
df_full.describe()
Out[3]:
In [4]:
# Keep only the columns used in the rest of the analysis. The explicit copy()
# makes df an independent frame, so the column assignments made further down
# do not act on a selection of df_full (the usual trigger of pandas'
# SettingWithCopyWarning).
df = df_full[
    ['d', 'r', 'x', 'y', 'z', 'error', 'error_csd',
     'd_min', 'id', 'duplicate', 'fICA_corr']
].copy()
In [5]:
# Summary statistics of the reduced table.
df.describe()
Out[5]:
In [6]:
df.head() # first 5 rows; head(10) gives the first 10, etc.
# tail() works the same way but returns the last rows instead of the first
Out[6]:
In [7]:
# Overview of the reduced table before the NaN rows are dropped in the next cell.
df.info()
In [8]:
# Get rid of rows containing NaNs. One call is enough: dropna() returns the
# cleaned frame, which is bound back to df.
df = df.dropna()
# df.dropna(inplace=True)  # in-place spelling of the same step; redundant after the line above
# df = df.fillna(0)        # alternative: fill NaNs with a value instead of dropping the rows
In [9]:
# Re-check the table after the NaN rows were dropped.
df.info()
In [10]:
# Bin r and d into 0.25-wide intervals. With retbins=True, pd.cut returns the
# per-row interval labels together with the array of bin edges.
r_edges = np.arange(0, 2.25, 0.25)
d_edges = np.arange(0, 4.25, 0.25)
r_labels, bin_r = pd.cut(df['r'], r_edges, retbins=True)
d_labels, bin_d = pd.cut(df['d'], d_edges, retbins=True)
df.loc[:, 'bin_r'] = r_labels
df.loc[:, 'bin_d'] = d_labels
# You could also write df['bin_d'] = pd.cut(df['d'], np.arange(0, 4.25, 0.25)).
# NOTE(review): the .loc form is presumably used to avoid pandas'
# SettingWithCopyWarning on a frame derived from another frame -- confirm.
In [11]:
# Summary statistics after the bin_r / bin_d columns were added.
df.describe()
Out[11]:
In [12]:
# Overview of the table after the bin_r / bin_d columns were added.
df.info()
In [13]:
# Sort by several columns at once: bin_r ascending, then d descending.
# The sorted frame is only displayed; df itself is not reassigned.
df.sort_values(by=['bin_r', 'd'], ascending=[True, False])
In [ ]:
# bin_d holds the interval labels produced by pd.cut rather than numbers, so
# the number of rows per bin is drawn as a bar chart (Series.hist() expects
# numeric data). sort_index() keeps the bars in bin order.
df['bin_d'].value_counts().sort_index().plot(kind='bar')
In [15]:
# rename() returns a new frame; binding it back to df avoids mutating the
# existing object in place.
df = df.rename(columns={'bin_r': 'r_binned', 'bin_d': 'd_binned'})
In [16]:
# Number of rows in each (r bin, d bin) combination.
df_tabled = pd.crosstab(index=df['r_binned'], columns=df['d_binned'])
df_tabled
Out[16]:
In [17]:
# Heatmap of the r-bin x d-bin counts held in df_tabled.
# (matplotlib-only alternative: plt.pcolor(df_tabled.values))
sns.heatmap(df_tabled)
# NOTE(review): presumably fixes the y range so the first r bin is drawn at the bottom -- confirm
plt.ylim((0,8))
Out[17]:
In [18]:
# Euclidean norm of (x, y, z), stored as 'distance' and summarised next to 'd'.
x_sq = np.square(df['x'])
y_sq = np.square(df['y'])
z_sq = np.square(df['z'])
df.loc[:, 'distance'] = np.sqrt(x_sq + y_sq + z_sq)
df[['d', 'distance']].describe()
Out[18]:
In [19]:
def calc_phi(df):
    """Return the angle of the point (x, y), in radians, from the positive x axis.

    ``df`` is anything indexable by ``'x'`` and ``'y'``: a whole DataFrame
    (the result is then computed column-wise) or a single row as handed over
    by ``DataFrame.apply(..., axis=1)``.

    ``np.arctan2`` is used instead of ``np.arctan(y / x)`` so that the signs
    of both coordinates select the quadrant (range (-pi, pi]) and x == 0 does
    not cause a division by zero.
    """
    return np.arctan2(df['y'], df['x'])
def calc_theta(df):
    """Return arccos(z / d), in radians, from the 'z' and 'd' entries of ``df``.

    ``df`` is anything indexable by ``'z'`` and ``'d'``: a whole DataFrame or
    a single row as handed over by ``DataFrame.apply(..., axis=1)``.
    """
    cos_theta = df['z'] / df['d']
    return np.arccos(cos_theta)
# apply(..., axis=1) calls the function once per row (Series.map, by contrast,
# works element by element); phi keeps this row-wise form as a demonstration.
df['phi'] = df.apply(calc_phi, axis=1)
# calc_theta also accepts the whole frame, so theta is computed once in a
# single column-wise call instead of row by row and then again.
df['theta'] = calc_theta(df)
# Last expression of the cell, so the summary is actually displayed.
df[['theta', 'phi']].describe()
In [20]:
# Series.map calls the given callable on every element; here that callable is
# the bound format method of the template string.
phi_template = 'this is a string describing phi = {}'
ds_string = df['phi'].map(phi_template.format)
In [21]:
#df.cov()
In [22]:
# df also carries the categorical bin columns at this point, so the numeric
# (and boolean) columns are selected explicitly before correlating instead of
# relying on corr() to skip the non-numeric ones.
df.select_dtypes(include=[np.number, 'bool']).corr()
Out[22]:
In [23]:
# Histograms of df's columns, drawn with figsize (16, 9).
df.hist(figsize=(16,9))
Out[23]:
In [24]:
# .values exposes the selected columns as a plain NumPy array.
df_full[['r','d','error']].values
Out[24]:
In [25]:
# Number of distinct ids versus number of non-null ids. print is written as a
# call so the cell runs under both Python 2 and Python 3.
print(df_full['id'].nunique())
print(df_full['id'].count())
In [ ]:
In [26]:
# Median error_csd for every (r bin, d bin) combination, laid out as a grid
# with r bins as rows and d bins as columns.
grouped_table_2 = pd.pivot_table(
    df,
    values='error_csd',
    index='r_binned',
    columns='d_binned',
    aggfunc=np.median,
)
grouped_table_2
Out[26]:
In [27]:
# 0.5 quantile of error per (r bin, d bin) group, reshaped so d bins become
# columns; combinations without a value are filled with 5.
error_q50_by_bin = df.groupby(['r_binned', 'd_binned'])['error'].quantile(0.5)
grouped_table = error_q50_by_bin.unstack().fillna(5)
grouped_table
Out[27]:
In [28]:
# Reshape from wide to long: one row per (r bin, d bin, error type) with the
# value in "error value", then order the rows by the two bin columns.
bin_columns = ['r_binned', 'd_binned']
df_sub = df[bin_columns + ['error', 'error_csd']]
melted_table = pd.melt(
    df_sub,
    id_vars=bin_columns,
    var_name="error type",
    value_name="error value",
).sort_values(by=bin_columns)
melted_table
Out[28]:
In [29]:
# Box plot of the error values per d bin, one box per error type.
plt.figure(figsize=(16, 9))
sns.boxplot(x='d_binned', y='error value', hue='error type', data=melted_table)
Out[29]:
In [30]:
# For every (r bin, d bin) group, print how far the 0.5 quantile of error lies
# from the group mean. print is written as a call so the cell runs under both
# Python 2 and Python 3.
df_2D = df.groupby(['r_binned', 'd_binned'])['error']
for name, group in df_2D:
    # print(name, group)
    print(group.quantile(0.5) - group.mean())
In [ ]:
In [ ]:
In [ ]:
In [31]:
# Overview of df with all derived columns added so far.
df.info()
In [32]:
# 0.5 quantile of error per (r bin, d bin) group as a grid; combinations with
# no value get 2.50.
error_q50 = df.groupby(['r_binned', 'd_binned'])['error'].quantile(0.5)
error_grid = error_q50.unstack(fill_value=2.50)
# Keep a cell only where df_tabled holds more than 7 rows; every other cell is
# set to 2.5, the same value used as vmax below.
df_2D = error_grid.where(df_tabled > 7, 2.5)
sns.heatmap(df_2D, cmap='Greys', vmin=0, vmax=2.5)
plt.ylim([0, 8])
Out[32]:
In [33]:
# Rows of the full table whose 'duplicate' value equals 0.
is_not_duplicate = df_full['duplicate'] == 0
df_missed = df_full.loc[is_not_duplicate]
# Joint distribution of r and d for those rows, drawn with hexagonal bins.
g = sns.jointplot(data=df_missed, x="r", y="d", kind='hex', color="k")
g.ax_joint.set(xlim=(0, 1.05), ylim=(0, 4.1))
Out[33]:
In [34]:
# Same r/d joint plot of df_missed, drawn as a kernel density estimate.
g = sns.jointplot(data=df_missed, x="r", y="d", kind='kde', color="k")
g.ax_joint.set(xlim=(0, 1.05), ylim=(0, 4.1))
Out[34]:
In [35]:
# Pairwise view of phi, theta and error: scatter plots above the diagonal,
# KDE contours below it and histograms on it. Each map_* call hands back the
# grid, so the calls are chained.
# Alternative for the diagonal: .map_diag(sns.kdeplot, lw=3, legend=False)
g = (
    sns.PairGrid(data=df, vars=['phi', 'theta', 'error'])
    .map_upper(plt.scatter)
    .map_lower(sns.kdeplot, cmap="Blues_d")
    .map_diag(plt.hist)
)
g
Out[35]:
In [ ]:
In [38]:
# Box plot of error within each r bin.
plt.figure(figsize=(16, 9))
sns.boxplot(x='r_binned', y='error', data=df)
Out[38]:
In [39]:
# Box plot of error within each r bin, split further by d bin.
plt.figure(figsize=(16, 9))
sns.boxplot(x='r_binned', y='error', hue='d_binned', data=df)
Out[39]:
In [40]:
# Swarm plot of the individual error values within each r bin.
plt.figure(figsize=(16, 9))
sns.swarmplot(x='r_binned', y='error', data=df)
Out[40]:
In [41]:
# Violin plot of error within each r bin.
sns.violinplot(x='r_binned', y='error', data=df)
Out[41]:
In [42]:
# Split violins per r bin: one half per error type from the long-format table.
sns.violinplot(
    x='r_binned',
    y='error value',
    hue='error type',
    split=True,
    data=melted_table,
)
Out[42]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: