In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Lift the cap on displayed columns so wide frames are shown in full.
pd.options.display.max_columns = None
# IPython magic: render matplotlib figures inline, below the cell that draws them.
%matplotlib inline
In [2]:
# Directory that holds the input CSV files. Set the SPEED_DATING_DATA_DIR
# environment variable before starting the kernel to point somewhere else;
# the fallback is the original local folder, so existing set-ups keep working.
source_path = os.environ.get(
    "SPEED_DATING_DATA_DIR",
    "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/",
)
# Later cells build file names with `source_path + "<name>.csv"`, so make sure
# the directory string always ends with a path separator.
source_path = os.path.join(source_path, "")
In [3]:
# Build the full path to the CSV file, then load it into a DataFrame.
csv_file = source_path + "Speed_Dating_Data.csv"
raw_dataset = pd.read_csv(csv_file)
In [4]:
# Dimensions of the loaded data as a (rows, columns) tuple.
raw_dataset.shape
Out[4]:
In [5]:
# Preview the first rows of the data (head() shows five rows by default).
raw_dataset.head()
Out[5]:
In [6]:
# How many columns there are of each dtype.
column_dtypes = raw_dataset.dtypes
column_dtypes.value_counts()
Out[6]:
In [7]:
# Missing-value count per column, shown for the first ten columns only.
missing_per_column = raw_dataset.isnull().sum()
missing_per_column.head(10)
Out[7]:
In [8]:
# Descriptive statistics from describe(), transposed so that each described
# column becomes one row of the summary table.
summary = raw_dataset.describe().transpose()
# The parenthesised print(...) form runs on both Python 2 and Python 3; the
# bare `print summary...` statement is a SyntaxError on Python 3 kernels.
print(summary.head(15))
In [9]:
# Age: histogram of the non-missing values in the `age` column.
fig, ax = plt.subplots()
ax.hist(raw_dataset['age'].dropna())
# Label the figure so it can be read without looking at the code.
ax.set(title='Age', xlabel='age', ylabel='Number of rows');
In [10]:
# Attractiveness: histogram of the non-missing values in the `attr_o` column.
fig, ax = plt.subplots()
ax.hist(raw_dataset['attr_o'].dropna())
# Label the figure so it can be read without looking at the code.
ax.set(title='Attractiveness', xlabel='attr_o', ylabel='Number of rows');
In [11]:
# Sincere: histogram of the non-missing values in the `sinc_o` column.
fig, ax = plt.subplots()
ax.hist(raw_dataset['sinc_o'].dropna())
# Label the figure so it can be read without looking at the code.
ax.set(title='Sincere', xlabel='sinc_o', ylabel='Number of rows');
In [12]:
# Intelligent: histogram of the non-missing values in the `intel_o` column.
fig, ax = plt.subplots()
ax.hist(raw_dataset['intel_o'].dropna())
# Label the figure so it can be read without looking at the code.
ax.set(title='Intelligent', xlabel='intel_o', ylabel='Number of rows');
In [13]:
# Fun: histogram of the non-missing values in the `fun_o` column.
fig, ax = plt.subplots()
ax.hist(raw_dataset['fun_o'].dropna())
# Label the figure so it can be read without looking at the code.
ax.set(title='Fun', xlabel='fun_o', ylabel='Number of rows');
In [14]:
# Ambitious: histogram of the non-missing values in the `amb_o` column.
fig, ax = plt.subplots()
ax.hist(raw_dataset['amb_o'].dropna())
# Label the figure so it can be read without looking at the code.
ax.set(title='Ambitious', xlabel='amb_o', ylabel='Number of rows');
In [15]:
# Number of distinct `iid` values for each value of `date`, largest first.
iid_per_date = raw_dataset.groupby('date')['iid'].nunique()
iid_per_date.sort_values(ascending=False)
Out[15]:
In [16]:
# Number of distinct `iid` values for each value of `go_out`, largest first.
iid_per_go_out = raw_dataset.groupby('go_out')['iid'].nunique()
iid_per_go_out.sort_values(ascending=False)
Out[16]:
In [17]:
# Number of distinct `iid` values for each value of `career`; show the ten
# largest groups.
iid_per_career = raw_dataset.groupby('career')['iid'].nunique()
iid_per_career.sort_values(ascending=False).head(10)
Out[17]:
In [ ]:
In [ ]: