Import pandas, numpy, and the plotting libraries (ggplot, matplotlib)
In [1]:
import pandas as pd
import numpy as np
from ggplot import *
import matplotlib
matplotlib.style.use('ggplot')
Test some Python easter eggs
In [2]:
import __hello__
In [3]:
import this
Data taken from GitHub: https://github.com/veekun/pokedex
In [6]:
# Relative path to the directory holding the pokedex CSV files.
# Keep the trailing slash: file names are appended to it with plain `+` below.
path = "../pokedex/pokedex/data/csv/"
Load data
In [7]:
# Read each CSV into its own DataFrame; read_csv's default separator is already ','.
pokemons = pd.read_csv(path + "pokemon.csv")
pokemon_species_names = pd.read_csv(path + "pokemon_species_names.csv")
pokemon_species = pd.read_csv(path + "pokemon_species.csv")
generation_names = pd.read_csv(path + "generation_names.csv")
habitat = pd.read_csv(path + "pokemon_habitats.csv")
In [8]:
# Check what kind of object read_csv handed back.
type(pokemons)
Out[8]:
In [9]:
# Preview the first two rows of the freshly loaded frame.
pokemons.head(2)
Out[9]:
In [11]:
# List the column labels.
pokemons.columns
Out[11]:
In [12]:
# Show the dtype pandas inferred for each column.
pokemons.dtypes
Out[12]:
In [13]:
# Compare the two ways of selecting one column: bracket access and attribute access.
type(pokemons['identifier']), type(pokemons.identifier)
Out[13]:
In [14]:
# Attribute-style column access; head() returns five rows by default.
pokemons.identifier.head()
Out[14]:
In [15]:
# First row by index label. `.ix` was deprecated in pandas 0.20 and removed in 1.0;
# the frame keeps read_csv's default integer index, where `.ix[0]` resolved by label,
# so `.loc[0]` is the direct equivalent.
pokemons.loc[0]
Out[15]:
In [16]:
# Bracket-style column access; head() returns five rows by default.
pokemons['identifier'].head()
Out[16]:
In [17]:
# Single cell by (row label, column label). `.ix` no longer exists in pandas >= 1.0;
# `.loc` is the label-based replacement.
pokemons.loc[0, 'weight']
Out[17]:
In [22]:
# Add a column by assigning a scalar: the same placeholder string is broadcast to every row.
# Note this mutates `pokemons` in place.
pokemons['new_column'] = 'MGA POGING POKEMON'
In [23]:
# Confirm that new_column now appears on the frame.
pokemons.head(2)
Out[23]:
In [24]:
# Overwrite one cell (row label 1, column 'new_column'). `.ix` was removed in pandas 1.0;
# `.loc` performs the same label-based assignment.
pokemons.loc[1, 'new_column'] = 'AY HINDI PALA POGI, HALAMAN PALA SILA.'
In [25]:
# Row 1 should now show the overridden value while row 0 keeps the original one.
pokemons.head(2)
Out[25]:
Discover the distinct values of new_column
In [26]:
# Distinct values present in new_column, in order of first appearance.
pokemons['new_column'].unique()
Out[26]:
In [27]:
# Summary statistics for the whole frame.
pokemons.describe()
Out[27]:
Get summary statistics for the more relevant columns
In [28]:
# Restrict the summary to the three measurement columns of interest.
size_and_xp = ['height', 'weight', 'base_experience']
pokemons[size_and_xp].describe()
Out[28]:
In [30]:
%matplotlib inline
pokemons[['height']].hist(bins=50,figsize=(15,5)), pokemons[['weight']].hist(bins=50,figsize=(15,5))
Out[30]:
In [63]:
# Density plot of the height column.
height_only = pokemons[['height']]
height_only.plot(figsize=(15, 5), kind='density')
Out[63]:
In [31]:
# Keep only the rows whose height is at least 15, then preview the result.
is_tall = pokemons['height'] >= 15
filtered_pokemons = pokemons[is_tall]
filtered_pokemons.head(2)
Out[31]:
In [32]:
# Histogram of height for the filtered subset only.
tall_heights = filtered_pokemons[['height']]
tall_heights.hist(figsize=(15, 5), bins=20)
Out[32]:
In [33]:
# Put both column lists side by side to see what the two frames have in common.
pokemon_species.columns, pokemons.columns
Out[33]:
In [34]:
# Rename only the key column so it matches `species_id`, the column the merge below joins on.
# The previous version overwrote `.columns` with a hard-coded list of 18 names, which
# raises on any change in column count and silently mislabels data on any change in order.
# NOTE(review): assumes the key column in pokemon_species.csv is named 'id' — confirm
# against the CSV header. Re-running this cell is harmless: a missing key is ignored.
pokemon_species = pokemon_species.rename(columns={'id': 'species_id'})
In [35]:
# Left-join species-level attributes onto every Pokémon row via species_id.
# The empty-DataFrame pre-assignment was dead code (overwritten on the next line)
# and has been removed.
# Column names present in both frames (e.g. 'identifier') get pandas' default
# _x / _y suffixes in the result.
pokemon_species_merged = pokemons.merge(pokemon_species, on=['species_id'], how='left')
In [36]:
# Preview the merged frame to check that the species columns were attached.
pokemon_species_merged.head(2)
Out[36]:
In [37]:
# Keep the name rows for local_language_id 9 (presumably English, going by the
# variable name — confirm against the languages table).
is_language_9 = pokemon_species_names.local_language_id == 9
# Rename just the key column instead of overwriting `.columns` positionally on the
# filtered slice; the positional form depended on the CSV's exact column count and order.
# NOTE(review): assumes the key column is named 'pokemon_species_id' in
# pokemon_species_names.csv — confirm against the CSV header.
pokemon_species_names_english = (
    pokemon_species_names[is_language_9]
    .rename(columns={'pokemon_species_id': 'species_id'})
)
# Left-join the localized name and genus onto the merged Pokémon/species frame.
pokemon_species_merged_english = pokemon_species_merged.merge(
    pokemon_species_names_english, on=['species_id'], how='left'
)
In [38]:
# Inspect the column labels after the second merge.
pokemon_species_merged_english.columns
Out[38]:
In [39]:
# Per-genus summary: number of rows, mean height and median weight.
# String aliases replace the NumPy callables (np.mean / np.median / np.count_nonzero):
# recent pandas versions warn when NumPy functions are passed to agg, and the aliases
# select the built-in grouped implementations directly.
# 'count' tallies non-null ids per group — the same quantity a later cell cross-checks
# with `.count()`. It equals the former np.count_nonzero result as long as ids are
# non-null and non-zero (TODO confirm for this dataset).
pokemon_species_merged_agg = pokemon_species_merged_english.groupby("genus").agg(
    {'id': 'count', 'height': 'mean', 'weight': 'median'}
)
pokemon_species_merged_agg.tail(5)
Out[39]:
In [40]:
# Show every row whose genus is exactly 'Worm'.
is_worm = pokemon_species_merged_english.genus == 'Worm'
pokemon_species_merged_english[is_worm]
Out[40]:
In [41]:
# Count the non-null ids among the 'Worm' rows (cross-check for the aggregated table).
worm_ids = pokemon_species_merged_english.loc[
    pokemon_species_merged_english.genus == 'Worm', 'id'
]
worm_ids.count()
Out[41]:
In [42]:
# Flip the summary so each genus becomes a column and each statistic a row.
pokemon_species_merged_agg.transpose()
Out[42]:
In [44]:
# Load the two churn CSV files (the "-20" and "-80" files); ',' is read_csv's default separator.
test_data = pd.read_csv("../maprdata/pyspark-churn-prediction/data/churn-bigml-20.csv")
train_data = pd.read_csv("../maprdata/pyspark-churn-prediction/data/churn-bigml-80.csv")
In [54]:
# List the columns available in the churn data.
test_data.columns
Out[54]:
In [64]:
# Disabled alternative: restrict the sample to six columns instead of using the full frame.
# test_data_sample = test_data[[u'Total day calls', u'Total day charge', u'Total eve minutes',\
# u'State', u'International plan',\
# u'Churn']]
# Plain assignment: test_data_sample is a second name for the same DataFrame object, not a copy.
test_data_sample = test_data
In [65]:
# Disabled: the same summary taken through the test_data_sample name.
#test_data_sample.describe().T
# Summary statistics, transposed so that each described column becomes a row.
test_data.describe().T
Out[65]:
In [85]:
# Pairwise scatter-plot grid over the columns of test_data; returns a 2-D array of Axes.
# `pd.tools.plotting` was relocated to `pd.plotting` (pandas 0.20) and the old
# module path was later removed, so the call goes through `pd.plotting`.
axs = pd.plotting.scatter_matrix(test_data, figsize=(25, 20))
# From http://stackoverflow.com/questions/23009509/how-to-modify-pandas-plotting-integration
def wrap(txt, width=8):
    '''Wrap text for long labels.

    Splits ``txt`` into lines of at most ``width`` characters using
    ``textwrap.wrap`` and joins them with newline characters.

    Parameters
    ----------
    txt : str
        The label text to wrap.
    width : int, optional
        Maximum line length in characters (default 8).

    Returns
    -------
    str
        The wrapped text; an empty string when ``txt`` is empty.
    '''
    # Function-scope import, as in the original cell, so the helper is self-contained.
    import textwrap
    return '\n'.join(textwrap.wrap(txt, width))
# Tidy the outer edge of the scatter matrix: wrap the long column names used as
# axis labels and remove the tick marks.
# `grid(False, ...)` replaces `grid('off', ...)`: current matplotlib no longer treats
# the string 'off' as a "false" value, so the non-empty string would switch the grid ON.
for ax in axs[:, 0]:  # the left boundary
    ax.grid(False, axis='both')
    # Horizontal, vertically centred y-labels, padded away from the axis.
    ax.set_ylabel(wrap(ax.get_ylabel()), rotation=0, va='center', labelpad=20)
    ax.set_yticks([])
for ax in axs[-1, :]:  # the lower boundary
    ax.grid(False, axis='both')
    # Vertical x-labels along the bottom row.
    ax.set_xlabel(wrap(ax.get_xlabel()), rotation=90)
    ax.set_xticks([])
In [97]:
# Look at two raw rows before choosing which columns to keep as features.
test_data.head(2)
Out[97]:
In [107]:
# Columns excluded from the feature frame; everything else in test_data is kept.
excluded_columns = [
    'State', 'Account length', 'Area code', 'International plan',
    'Total day minutes', 'Total eve minutes', 'Total night minutes', 'Total intl minutes',
    'Voice mail plan',
]
test_data_features = test_data.drop(columns=excluded_columns)
In [108]:
# Verify which columns survived the drop.
test_data_features.columns
Out[108]:
In [110]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# Random forests are stochastic; a fixed random_state makes the fitted model and its
# score reproducible across Restart & Run All.
# NOTE(review): the target fitted later in this notebook is the 'Churn' column; if that
# is a yes/no label, a classifier may be the better fit — confirm.
rf = RandomForestRegressor(random_state=42)
In [111]:
# Input columns handed to the model.
predictor_columns = [
    'Number vmail messages', 'Total day calls', 'Total day charge',
    'Total eve calls', 'Total eve charge', 'Total night calls',
    'Total night charge', 'Total intl calls', 'Total intl charge',
    'Customer service calls',
]
test_predictors = test_data_features[predictor_columns]
# Target, kept as a one-column DataFrame (double brackets).
test_label = test_data_features[['Churn']]
In [117]:
# Fit on plain arrays; the one-column label frame is flattened to 1-D first.
rf.fit(test_predictors.values, test_label.values.ravel())
Out[117]:
In [126]:
# Score on the very same rows that were passed to rf.fit, so this is an in-sample
# figure rather than a held-out evaluation.
rf.score(test_predictors.values, test_label.values.ravel())
Out[126]:
In [139]:
# Number of non-null 'Total day calls' entries for each Churn value.
test_data_sample.groupby("Churn")[['Total day calls']].count()
Out[139]: