In [2]:
%matplotlib inline
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys, os
sys.path.append("/Python")
from myfunctions import *
A quick first inspection of the dataset:
In [3]:
# Read the aDRAC CSV (UTF-8 encoded); the path is relative to the working directory.
df = pd.read_csv(
    "../data/aDRAC.csv",
    encoding='utf8',
)
# Preview the first five rows as a rich table.
display(df.head(n=5))
In [4]:
# Histogram of the C14AGE column, using roughly one bin per 1000 units of age.
ages = df['C14AGE']
age_span = ages.max() - ages.min()
# Force a positive integer bin count: a span below 500 would otherwise round
# to 0 bins (rejected by the histogram routine), an empty or all-NaN column
# yields a NaN span, and older numpy versions return a float from round().
n_bins = max(1, int(round(age_span / 1000))) if pd.notna(age_span) else 1
ax = ages.hist(color='grey', bins=n_bins, histtype='stepfilled', fc='#CCCCCC')
# Reverse the x axis so the largest C14AGE values are drawn on the left.
ax.invert_xaxis()
ax.set_xlabel('C14AGE')
ax.set_ylabel('Number of dates')
# Vertical reference line at 1950, labelled as the BC/AD boundary.
# NOTE(review): presumes C14AGE is given in (uncalibrated) years BP — confirm.
ax.axvline(x=1950, color='r', label='BC/AD')
legend = ax.legend(loc=2)
legend.get_frame().set_facecolor('white')
In [31]:
# Summary statistics of the C14AGE column.
df.loc[:, 'C14AGE'].describe()
Out[31]:
Which sites deliver the most dates? (Top 10)
In [32]:
# Per-site summary of C14AGE: number of rows ('len'), minimum, mean and maximum.
# groupby with named aggregation always yields flat column labels. With current
# pandas, pivot_table(aggfunc=[...]) returns two-level columns, so sorting by
# the plain label 'len' raises a ValueError there.
df_pivot = (
    df.groupby('SITE')['C14AGE']
    # 'size' counts every row of a group, as len() did. 'amin'/'amax' keep the
    # column labels that were historically derived from np.min/np.max.
    .agg(len='size', amin='min', mean='mean', amax='max')
    .reset_index()
    .sort_values(by='len', ascending=False)
)
# The ten sites with the most dates.
df_pivot.head(10)
Out[32]:
Distribution of the number of dates per site:
In [33]:
# Line plot of the 'len' column against 'SITE', in the current row order of df_pivot; legend hidden.
df_pivot.plot.line(x='SITE', y='len', legend=False)
Out[33]:
In [34]:
# Summary statistics of the 'len' column of df_pivot.
df_pivot.loc[:, 'len'].describe()
Out[34]:
How to get the number of dates per site and feature:
In [35]:
# Summary of C14AGE for every (SITE, FEATURE) combination: number of rows
# ('len'), minimum, mean and maximum. The result is indexed by SITE and FEATURE.
# Named aggregation with string reducers gives flat, version-independent column
# labels and avoids passing numpy callables (np.min/np.mean/np.max) to pandas,
# which recent pandas versions warn about.
df_pivot = (
    df.groupby(['SITE', 'FEATURE'])['C14AGE']
    # 'size' counts every row of a group, as len() did.
    .agg(len='size', amin='min', mean='mean', amax='max')
)
# First ten (SITE, FEATURE) rows.
df_pivot.head(10)
Out[35]:
In [36]:
# Count of rows for each FEATURE_DESC value (len() applied to the C14AGE values of each group).
pd.pivot_table(
    df,
    values='C14AGE',
    index=['FEATURE_DESC'],
    aggfunc=len,
)
Out[36]:
As decimal degrees:
In [46]:
# Smallest and largest values of the LONG and LAT columns, collected in one Series.
pd.Series({
    'X/LONG - Min': df['LONG'].min(),
    'X/LONG - Max': df['LONG'].max(),
    'Y/LAT - Min': df['LAT'].min(),
    'Y/LAT - Max': df['LAT'].max(),
})
Out[46]:
As degrees/minutes/seconds:
In [47]:
# Same LONG/LAT extremes, each passed through decimalDegrees2DMS together with
# the axis name ('Longitude' or 'Latitude').
# NOTE(review): decimalDegrees2DMS is not defined in this notebook — presumably
# it comes from the star import of myfunctions; confirm.
pd.Series({
    'X/LONG - Min': decimalDegrees2DMS(df['LONG'].min(), 'Longitude'),
    'X/LONG - Max': decimalDegrees2DMS(df['LONG'].max(), 'Longitude'),
    'Y/LAT - Min': decimalDegrees2DMS(df['LAT'].min(), 'Latitude'),
    'Y/LAT - Max': decimalDegrees2DMS(df['LAT'].max(), 'Latitude'),
})
Out[47]:
How many dates per country?
In [55]:
# Number of rows per COUNTRY, largest count first.
# groupby().size() yields a single flat 'len' column. With current pandas,
# pivot_table(aggfunc=[len]) returns two-level columns, so sorting by the
# plain label 'len' raises a ValueError there.
df_pivot = (
    df.groupby('COUNTRY')
    .size()
    .reset_index(name='len')
    .sort_values(by='len', ascending=False)
)
df_pivot
Out[55]:
In [ ]: