In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
In [2]:
columns = ['Datetime', 'ID', 'Course', 'Python_Experience', 'Favourite_Language']
df1 = pd.read_csv('Data Fusion.csv', names=columns, skiprows=1)
df1.set_index('ID', inplace=True)
df1.head()
Out[2]:
In [3]:
len(df1)
Out[3]:
In [4]:
df1.dtypes
#type(df1.Datetime[0])
Out[4]:
In [5]:
df1['Datetime'] = pd.to_datetime(df1['Datetime'])
In [6]:
df1.index.is_unique
Out[6]:
In [7]:
## if non-unique idex we can test inspect the duplicate indicies
indices = df1.groupby(level=0)
duplicate_indices = []
for index, gi in indices:
if len(gi) > 1:
print("Repeated index: ", index)
duplicate_indices.append(gi)
In [8]:
duplicate_indices[0]
Out[8]:
In [9]:
df1 = df1.loc[~df1.index.duplicated(keep='first')]
print(df1.index.is_unique)
df1.head()
Out[9]:
In [10]:
df1.index.is_unique
Out[10]:
In [11]:
colums = ['Datetime', 'ID', 'Course', 'Ethics_Importance', 'Female_Developers', 'Favourite_Movie', 'Fusion_versus_Wrangling']
df2 = pd.read_csv('Data Fusion 2.csv', names=colums, skiprows=1)
df2.set_index('ID', inplace=True)
df2['Datetime'] = pd.to_datetime(df2['Datetime'])
df2.head()
Out[11]:
In [12]:
df2.index.is_unique
## if not unique...as above
Out[12]:
In [13]:
resultset1_ids = set(df1.index.unique())
resultset2_ids = set(df2.index.unique())
In [14]:
try:
assert resultset1_ids == resultset2_ids
except AssertionError:
print("Warning: ID columns do not match exactly.")
In [15]:
intersection = resultset1_ids.intersection(resultset2_ids)
difference = resultset1_ids.difference(resultset2_ids)
print(intersection)
print(difference)
In [16]:
df2.loc[df2.index.isin(intersection)]
Out[16]:
In [17]:
df1.loc[~df1.index.isin(intersection)].head()
Out[17]:
In [18]:
df2.loc[~df2.index.isin(intersection)].head()
Out[18]:
In [20]:
dodgy_ids = ['MM']
replacement_ids = ['MJM']
replace = lambda x: x if x not in dodgy_ids else replacement_ids[dodgy_ids.index(x)]
df1_corrected = df1.assign(NID = [replace(ii) for ii in df1.index]).set_index('NID')
df2_corrected = df2.assign(NID = [replace(ii) for ii in df2.index]).set_index('NID')
In [22]:
#df1.drop(columns=['NID'], inplace=True)
#df2_corrected = df2
#df1_corrected = df1
In [23]:
resultset1_ids = set(df1_corrected.index.unique())
resultset2_ids = set(df2_corrected.index.unique())
intersection = resultset1_ids.intersection(resultset2_ids)
difference = resultset1_ids.difference(resultset2_ids)
print(intersection)
print(difference)
In [24]:
df1_corrected.Course.value_counts()
Out[24]:
In [25]:
df2_corrected.Course.value_counts()
Out[25]:
In [26]:
set(df1_corrected.Course.unique()) == set(df2_corrected.Course.unique())
Out[26]:
We see the problem of inconsistent encoding of categorical variables that was disussed in the lecture! Fixing this would take some manual effort. A better solution would be to collect the data different (e.g. multiple choice), but we don't always have control over the data collection.
In [ ]:
In [27]:
#df1_corrected.drop(columns=['Datetime'], inplace=True)
#df2_corrected.drop(columns=['Datetime', 'Course'], inplace=True)
#joined = df1_corrected.loc[df1_corrected.index.isin(intersection)].merge(df2_corrected.loc[df2_corrected.index.isin(intersection)], left_index=True, right_index=True)
joined = df1_corrected.merge(df2_corrected, left_index=True, right_index=True, how='outer')
In [28]:
joined.head()
Out[28]:
In [32]:
## To see the rows with missing data because they did not fill in the second form:
joined.loc[~joined.index.isin(intersection)]
Out[32]:
In [33]:
joined.loc[joined.Course_x != joined.Course_y]
Out[33]:
In [36]:
most_recent = lambda row: row.Course_x if ((row.Datetime_x > row.Datetime_y) & (pd.notnull(row.Course_x))) else row.Course_y
joined = joined.assign(Course = [ci if ci==cj else most_recent(joined.iloc[ii]) for ii, (ci,cj) in enumerate(zip(joined.Course_x, joined.Course_y))])
joined.drop(columns=['Course_x', 'Course_y'], inplace=True)
joined.head()
Out[36]:
In [37]:
joined.Course.value_counts()
Out[37]:
In [38]:
plt.figure(figsize=(12,6))
plt.subplot(1,2,1)
plt.scatter(joined.Python_Experience, joined.Ethics_Importance)
plt.xlabel('Python_Experience')
plt.ylabel('Ethics_Importance');
plt.subplot(1,2,2)
plt.scatter(joined.Python_Experience, joined.Female_Developers)
plt.xlabel('Python_Experience')
plt.ylabel('Female_Developers');
In [39]:
joined.Favourite_Language.value_counts()
Out[39]:
In [40]:
joined.Favourite_Movie.value_counts()
Out[40]:
In [48]:
gi = 6
groups = list(joined.groupby('Course'))
print("There are %d courses." %len(groups))
plt.title(groups[gi][0])
groups[gi][1].Favourite_Language.value_counts().plot.pie()
plt.axis('off');
In [49]:
joined.loc[joined['Course'] == 'Digital Health']
#joined[joined[['Course']] == 'Digital Health']#[['Favourite_Movie']]
Out[49]:
In [51]:
def plot_movie_pie(group_on='Course', gi='Computer Science'):
plt.title(gi)
joined.loc[joined[group_on] == gi, 'Favourite_Movie'].value_counts().plot.pie()
plt.axis('off');
plot_movie_pie() #gi='Engineering Mathematics')
In [ ]: