In [1]:
# standard packages
import sys
import pickle
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# custom packages
from scripts.tools import df_check_stats, game, sam_pickle_save, sam_pickle_load
from scripts.tools import MarkdownMagics
from scripts.sam_value_counts import sam_dataframe_cols_value_count_analysis, sam_dataframe_markup_value_counts
from scripts.sam_confusion_matrix import sam_plot_confusion_matrix, sam_confusion_maxtrix
# extension - %%asmarkdown
from IPython.core.getipython import get_ipython
get_ipython().register_magics(MarkdownMagics)
%matplotlib inline
plt.style.use('ggplot')
sns.set(color_codes=True)
crazy_list = dir()  # snapshot of the names defined so far (imports etc.), preserved by the cleanup cell below
In [2]:
RAW_X = pd.read_csv('data/traning_set_values.csv', index_col='id')
RAW_y = pd.read_csv('data/training_set_labels.csv', index_col='id')
RAW_TEST_X = pd.read_csv('data/test_set_values.csv', index_col='id')
In [3]:
df_check_stats(RAW_X, RAW_TEST_X, RAW_y)
# 39 Features, 59K Records
In [4]:
RAW_y.status_group.value_counts() / RAW_y.size
# We can see that it's a multi-class (supervised) classification problem
Out[4]:
In [5]:
RAW_X.head()
# We can already see that column pairs like water_quality and quality_group look strongly correlated
Out[5]:
In [6]:
RAW_X.describe()
# gps_height has negative values, which is odd for a height.
Out[6]:
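A quick illustrative check (a sketch, not one of the original cells) of how many records actually report a negative height:

(RAW_X.gps_height < 0).sum()  # count of records with gps_height below zero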
In [7]:
RAW_X.info()
# we have lots of object columns
In [8]:
aa = RAW_X.isnull().sum()
bb = RAW_X.dtypes
cc = pd.DataFrame({'null_values': aa, 'data_types': bb})
cc
# We can see that `scheme_name` has ~28K records of missing data.
Out[8]:
In [9]:
cc[cc.null_values > 0]
# Taking a closer look at the columns with missing data.
Out[9]:
Note: As you can see, scheme_name has too many null values to fill in. Also, public_meeting and permit are shown as object dtype due to the presence of NaN (numpy.nan) values.
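A minimal illustration (not an original cell) of why a boolean column containing missing entries gets reported as object dtype by pandas:

pd.Series([True, False, np.nan]).dtype  # -> dtype('O'); NaN forces the column out of bool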
In [10]:
# NUMERIC (int/float, incl. date-like years)
cols_ints = '''amount_tsh
gps_height
longitude
latitude
num_private
region_code
district_code
population
construction_year'''.splitlines()
# BOOL
cols_bool = 'public_meeting permit'.split()
# DATE
cols_date = ['date_recorded']
# OTHER
cols_other = sorted([ _ for _ in RAW_X.columns if _ not in cols_bool + cols_date + cols_ints ])
print('Int Cols: ', len(cols_ints))
print('Bool Cols: ', len(cols_bool))
print('Date Cols: ', len(cols_date))
print('Other Cols:', len(cols_other))
In [11]:
RAW_X[cols_bool].head(10).T
# Note: presence of NaN
Out[11]:
In [12]:
RAW_X[cols_date].head(10).T
# Note: in RAW_X.info(), this column is shown as object.
Out[12]:
In [13]:
RAW_X[cols_ints].head(10)
# gps_height, amount_tsh, num_private, population, construction_year - all have zeros
# Taarifa, our data source (provider), is a mobile platform for collecting data from users. When
# the user/system did not have the correct details, they may have entered zero instead of leaving the field empty.
Out[13]:
Note: amount_tsh, gps_height, population, construction_year have zeros, which is certainly not plausible. For example, if the population were truly zero, logically there should not be any water pumps (or pump issues) in that region, and no person to add this record via Taarifa. We can treat these zeros like the null values in the object columns, given that the data is collected through Taarifa as manually added entries/complaints.
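As a sketch of that treatment (assuming we later decide to impute these columns), the suspect zeros can be flagged as missing just like the NaNs in the object columns:

cols_zero_suspect = ['amount_tsh', 'gps_height', 'population', 'construction_year']
zeros_as_null = RAW_X[cols_zero_suspect].replace(0, np.nan)
zeros_as_null.isnull().sum()  # the zeros now show up in the missing-value counts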
In [14]:
RAW_X[cols_ints].dtypes
Out[14]:
In [15]:
RAW_X[cols_ints].describe().T
# num_private - mostly zeros, judging by the 25%, 50%, 75% quartiles
# gps_height - seems to have a sudden jump at the 75% quartile
# construction_year - a year cannot be zero.
Out[15]:
In [16]:
cols_ints
Out[16]:
In [17]:
# box plot for int columns
f, axarr = plt.subplots(3, 3, figsize=(12, 10))
counter = 0
for x in range(3):
    for y in range(3):
        col_name = cols_ints[counter]
        counter += 1
        ax = axarr[x][y]
        # ax.set_title(col_name.lower(), fontsize=9)
        sns.boxplot(RAW_X[col_name], ax=ax)
# gps_height: as expected, seems to have some sudden spikes.
# longitude: we can see some dots at zero, completely outside the box plot (outliers/wrong data)
# amount_tsh: as expected from the percentiles in describe, there is a sharp increase above the 75% quartile
# construction_year: a single-value box plot
In [18]:
_ = sns.pairplot(RAW_X[cols_ints])
# longitude suffers from zero (wrong) values
In [19]:
### gps_height
### longitude
### latitude
plt.figure(figsize=(12, 3))
g = sns.pairplot(RAW_X[['latitude', 'longitude', 'gps_height']])
Note:
* `longitude, gps_height` seem to hold lots of zeros
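To quantify that observation, a quick sketch counting the exact zeros in each numeric column:

(RAW_X[cols_ints] == 0).sum().sort_values(ascending=False)  # zero counts per column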
In [20]:
RAW_X[cols_other].head().T
Out[20]:
In [21]:
values_counts_bag = [len(RAW_X[column].value_counts()) for column in cols_other]
Example of how np.log transforms data:
>>> np.log([0.001, 0.01, 0.1, 1, 10, 100, 1000])
array([-6.90775528, -4.60517019, -2.30258509, 0. , 2.30258509,
4.60517019, 6.90775528])
As the np.log example shows, when a list of values varies significantly (exponentially), their logarithms move linearly. Since linear plots and linear scales are easier to study and compare, we apply np.log to the value counts.
In [22]:
cols_vc_dataframe = pd.DataFrame(np.log(values_counts_bag), index=cols_other, columns=['Value Counts'])
In [23]:
print('Values Counts:', values_counts_bag)
print('\nLog of Values Counts:', cols_vc_dataframe.T.values)
plt.figure(figsize=(12, 3))
_ = sns.distplot(cols_vc_dataframe.T.values, hist=True, kde=False)
_ = plt.title("Histogram of Object Features' (log of) Unique Value Counts", fontsize=9)
_ = plt.xlabel('Log of Unique Value Counts', fontsize=9)
In [24]:
cols_vc_dataframe.plot(kind='barh', figsize=(12, 6), fontsize=8)
_ = plt.plot((2, 2), (0, 38))
_ = plt.plot((4, 4), (0, 38), '-g')
_ = plt.plot((6, 6), (0, 38), '-r')
_ = plt.plot((8, 8), (0, 38), '-y')
print('We seem to have some special categories where value counts are high.')
_ = plt.title('Comparing Unique Values in Object Columns', fontsize=9)
_ = plt.xlabel('Log of Unique Values', fontsize=9)
Note: wpt_name, ward, subvillage, scheme_name, installer and funder have lots of unique categorical values.
In [25]:
sam_dataframe_cols_value_count_analysis(RAW_X,
                                        cols_other,
                                        plot_col_vc_limit=55,
                                        x_plots_limit=11,
                                        y_plots_limit=2,
                                        show_percentages=False)
# we can observe strong similarities in the information
Note: the two empty subplots above come from the generic function we created to lay the plots out nicely; they do not indicate any data issue.
In [26]:
cols_vc_limit_fraction = 0.01
cols_vc_limit_value = RAW_X.shape[0] * cols_vc_limit_fraction
cols_vc_log_limit = np.log(cols_vc_limit_value)
print('Total Number of Records:', RAW_X.shape[0], '- Log val is:', np.log(RAW_X.shape[0]))
print('%s percent of Number of Records:' % (cols_vc_limit_fraction * 100),\
cols_vc_limit_value,\
' - Log val is:', cols_vc_log_limit)
Here in this project, cols_other_check refers to the list of columns flagged for a cautionary check. The reason for this check: for columns with so many categories, we would need much more data to explain the other columns and the target with respect to them.
Since our problem statement is choosing a category, we will try to minimise the number of categories in these columns and see how our performance changes (improves or not).
To begin, we take cols_vc_limit_fraction of the record count as the upper limit: in any flagged column, categories rarer than this limit will be pruned, i.e. merged into some 'other' bucket.
In [27]:
cols_other_check = []
for col, vc in cols_vc_dataframe.iterrows():
    if col in cols_other:
        if float(vc) > cols_vc_log_limit:
            cols_other_check.append(col)
            # print(col.upper())
            # print(str(vc).splitlines()[0])
            # print('---')
# print('Above shown are log transformed value counts')
print('Columns we need to moderate are:', cols_other_check)
As printed above, we will work on these columns to check whether we can reduce the number of categories to an optimal count, so that our models can learn the patterns more effectively.
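A minimal sketch of the pruning idea (the threshold and the 'other' label here are placeholder choices, not the final implementation):

def prune_rare_categories(series, min_count):
    # keep categories seen at least min_count times; merge the rest (incl. NaN) into 'other'
    counts = series.value_counts()
    keep = counts[counts >= min_count].index
    return series.where(series.isin(keep), 'other')

# example usage on one flagged column:
pruned = prune_rare_categories(RAW_X['funder'], cols_vc_limit_value)
print(RAW_X['funder'].nunique(), '->', pruned.nunique())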
In [28]:
plt.figure(figsize=(12, 3))
_ = sns.distplot(RAW_X.gps_height, hist=True, kde=False, rug=False)
# lots of zeros
In [29]:
plt.figure(figsize=(12, 3))
_ = sns.distplot(RAW_X.population, hist=True, kde=False, rug=False)
# lots of zeros
Checking longitude and latitude
In [30]:
_ = sns.jointplot(x='longitude', y='latitude', data=RAW_X)
_ = plt.xlabel('longitude')
_ = plt.ylabel('latitude')
# longitude has zero-valued (wrong) entries
In [31]:
aa = RAW_X['longitude latitude region'.split()].copy()
bb = aa.groupby(by=['region']).mean()
bb.columns = ['longitude_mean', 'latitude_mean']
cc = aa.groupby(by=['region']).min()
cc.columns = ['longitude_min', 'latitude_min']
dd = aa.groupby(by=['region']).max()
dd.columns = ['longitude_max', 'latitude_max']
abcd = bb.join(cc).join(dd)[['latitude_max',
'latitude_mean',
'latitude_min',
'longitude_max',
'longitude_mean',
'longitude_min']].copy()
_ = abcd.plot(figsize=(12, 3))
In [32]:
_ = sns.jointplot(x='gps_height', y='population', data=RAW_X)
_ = plt.xlabel('gps_height')
_ = plt.ylabel('population')
Columns with Similar information
In [33]:
print('Extraction Type:\n', RAW_X.extraction_type.unique().tolist())
print('\n')
print('Extraction Type Class:\n', RAW_X.extraction_type_class.unique().tolist())
print('\n')
print('Extraction Type Group:\n', RAW_X.extraction_type_group.unique().tolist())
In [34]:
print('Management:\n', RAW_X.management.unique().tolist())
print('\n')
print('Management Group:\n', RAW_X.management_group.unique().tolist())
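A quick way to verify how much two such columns overlap (illustrative, using pd.crosstab; any related pair could be substituted):

pd.crosstab(RAW_X['management'], RAW_X['management_group'])
# each management value should fall (almost) entirely under a single management_group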
In [35]:
RAW_X[RAW_X.longitude < 5]['region'].unique().tolist()
Out[35]:
In [36]:
RAW_X[RAW_X.latitude < 5]['region'].unique().tolist()
Out[36]:
If we look at the geographical map of Africa, we can clearly see that these values are not correct.
In [37]:
df = RAW_X.join(RAW_y)
# helper column so that aggfunc='count' counts records
df['sam'] = 1
df.pivot_table(values='sam', index=['status_group'], columns=['basin'], aggfunc='count').T
Out[37]:
In [38]:
_ = sns.heatmap(df.pivot_table(values='sam', index=['status_group'], columns=['basin'], aggfunc='count').T)
In [39]:
df.pivot_table(values='sam', index=['status_group'], columns=['region'], aggfunc='count').T
Out[39]:
In [40]:
aa = RAW_X[['region', 'gps_height']].groupby(['region']).min()
aa.columns = ['min']
bb = RAW_X[['region', 'gps_height']].groupby(['region']).mean()
bb.columns = ['mean']
cc = RAW_X[['region', 'gps_height']].groupby(['region']).max()
cc.columns = ['max']
_ = aa.join(bb).join(cc).plot(figsize=(12, 3))
In [41]:
aa
# 11456.7 Dodoma
Out[41]:
A few columns seem to hold similar kinds of information.
Geo-location information: all of the following parameters are available for the same reason, to locate the address.
Compared to all the other geo columns, the region column has complete data.
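A minimal sketch of how the complete region column could repair the bad coordinates (assuming region-wise means of the valid rows are an acceptable stand-in):

valid = RAW_X['longitude'] >= 5  # the zeros/near-zeros are the wrong entries
region_mean_lon = RAW_X[valid].groupby('region')['longitude'].mean()
longitude_fixed = RAW_X['longitude'].where(valid, RAW_X['region'].map(region_mean_lon))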
In [42]:
# clean up: remove names defined during the analysis, keeping the imports
# snapshotted in crazy_list (a plain `del each` would only unbind the loop variable)
for each in dir():
    if each not in crazy_list and each != 'crazy_list' and not each.startswith('_'):
        del globals()[each]
In [43]:
X, y, TEST_X = sam_pickle_load(prefix="tmp/Iteration2_final_")
In [44]:
_ = sns.jointplot(x='longitude', y='latitude', data=X)
_ = plt.xlabel('longitude')
_ = plt.ylabel('latitude')
# longitude has zero-valued (wrong) entries