In [418]:
%matplotlib inline
import matplotlib.pyplot as plt
In [420]:
import pandas as pd
In [1]:
# Import the Pandas library
import pandas as pd
# Load the train and test datasets to create two DataFrames
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
#Print the `head` of the train and test dataframes
print(train.head())
print(test.head())
In [3]:
train.to_csv('/Users/chengjun/github/cjc2016/data/tatanic_train.csv')
test.to_csv('/Users/chengjun/github/cjc2016/data/tatanic_test.csv')
You can easily explore a DataFrame
In [23]:
import pandas as pd
train = pd.read_csv('/Users/chengjun/github/cjc2016/data/tatanic_train.csv', sep = ",", header=0)
In [24]:
train.head()
Out[24]:
In [25]:
train.describe()
Out[25]:
In [26]:
train.shape
Out[26]:
In [27]:
# Passengers that survived vs passengers that passed away
train["Survived"].value_counts()
Out[27]:
In [28]:
# As proportions
train["Survived"].value_counts(normalize = True)
Out[28]:
In [29]:
train['Sex'].value_counts()
Out[29]:
In [30]:
# Males that survived vs males that passed away
train["Survived"][train["Sex"] == 'male'].value_counts()
Out[30]:
In [31]:
# Females that survived vs Females that passed away
train["Survived"][train["Sex"] == 'female'].value_counts()
Out[31]:
In [32]:
# Normalized male survival
train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True)
Out[32]:
In [33]:
# Normalized female survival
train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)
Out[33]:
In [34]:
# Create the column Child, and indicate whether child or not a child. Print the new column.
train["Child"] = float('NaN')
train.Child[train.Age < 18] = 1
train.Child[train.Age >= 18] = 0
print train.Child[:3]
In [23]:
# Normalized Survival Rates for under 18
train.Survived[train.Child == 1].value_counts(normalize = True)
Out[23]:
In [24]:
# Normalized Survival Rates for over 18
train.Survived[train.Child == 0].value_counts(normalize = True)
Out[24]:
In [25]:
# Create a copy of test: test_one
test_one = test
# Initialize a Survived column to 0
test_one['Survived'] = 0
# Set Survived to 1 if Sex equals "female" and print the `Survived` column from `test_one`
test_one.Survived[test_one.Sex =='female'] = 1
print test_one.Survived[:3]
In [26]:
#Convert the male and female groups to integer form
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
#Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna('S')
#Convert the Embarked classes to integer form
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
In [14]:
df = pd.read_csv('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df[:2]
Out[14]:
In [15]:
df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})
df[:5]
Out[15]:
In [16]:
da = pd.read_csv('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_author_info.txt', sep = "\t", header=None)
da[:2]
Out[16]:
In [17]:
da=da.rename(columns = {0:'author_page', 1:'followed_num', 2:'fans_num',3:'post_num', 4:'comment_num'})
da[:5]
Out[17]:
In [18]:
data = pd.concat([df,da], axis=1)
len(data)
Out[18]:
In [19]:
data[:3]
Out[19]:
In [20]:
type(data.time[0])
Out[20]:
In [21]:
# extract date from datetime
date = map(lambda x: x[:10], data.time)
data['date'] = pd.to_datetime(date)
In [22]:
# convert str to datetime format
data.time = pd.to_datetime(data.time)
data['month'] = data.time.dt.month
data['year'] = data.time.dt.year
data['day'] = data.time.dt.day
type(data.time[0])
Out[22]:
In [440]:
data[:3]
Out[440]:
http://statsmodels.sourceforge.net/
Statsmodels is a Python module that allows users to explore data, estimate statistical models, and perform statistical tests.
An extensive list of descriptive statistics, statistical tests, plotting functions, and result statistics are available for different types of data and each estimator.
Researchers across fields may find that statsmodels fully meets their needs for statistical computing and data analysis in Python.
In [325]:
import statsmodels.api as sm
In [326]:
' '.join(dir(sm))
Out[326]:
In [327]:
' '.join(dir(sm.stats))
Out[327]:
In [370]:
data.describe()
Out[370]:
In [162]:
import numpy as np
np.mean(data.click), np.std(data.click), np.sum(data.click)
Out[162]:
In [328]:
# 不加权的变量描述
d1 = sm.stats.DescrStatsW(data.click, weights=[1 for i in data.click])
d1.mean, d1.var, d1.std, d1.sum
Out[328]:
In [329]:
# 加权的变量描述
d1 = sm.stats.DescrStatsW(data.click, weights=data.reply)
d1.mean, d1.var, d1.std, d1.sum
Out[329]:
In [163]:
np.median(data.click)
Out[163]:
In [176]:
plt.hist(data.click)
plt.show()
In [186]:
plt.hist(data.reply, color = 'green')
plt.show()
In [185]:
plt.hist(np.log(data.click+1), color='green')
plt.hist(np.log(data.reply+1), color='red')
plt.show()
In [170]:
# Plot the height and weight to see
plt.boxplot([np.log(data.click+1)])
plt.show()
In [152]:
# Plot the height and weight to see
plt.boxplot([data.click, data.reply])
plt.show()
In [172]:
def transformData(dat):
results = []
for i in dat:
if i != 'na':
results.append( int(i))
else:
results.append(0)
return results
In [441]:
data.fans_num = transformData(data.fans_num)
data.followed_num = transformData(data.followed_num )
data.post_num = transformData(data.post_num )
data.comment_num = transformData(data.comment_num )
In [379]:
# Plot the height and weight to see
plt.boxplot([np.log(data.click+1), np.log(data.reply+1),
np.log(data.fans_num+1), np.log(data.followed_num + 1)],
labels = ['$Click$', '$Reply$', '$Fans$', '$Followed$'])
plt.show()
In [380]:
fig = plt.figure(figsize=(12,4))
data.boxplot(return_type='dict')
plt.yscale('log')
plt.show()
In [381]:
' '.join(dir(data))
Out[381]:
In [454]:
from pandas.tools import plotting
#fig = plt.figure(figsize=(10, 10))
plotting.scatter_matrix(data[['click', 'reply', 'post_num','comment_num']])
plt.show()
In [450]:
' '.join(dir(plotting))
Out[450]:
http://pandas.pydata.org/pandas-docs/version/0.15.0/visualization.html
In [455]:
import seaborn # conda install seaborn
seaborn.pairplot(data, vars=['click', 'reply', 'post_num', 'comment_num'],
kind='reg')
Out[455]:
In [458]:
seaborn.pairplot(data, vars=['click', 'reply', 'post_num'],
kind='reg', hue='year')
Out[458]:
In [457]:
seaborn.lmplot(y='reply', x='click', data=data)
Out[457]:
In [278]:
data.year.value_counts()
Out[278]:
In [292]:
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
dd
Out[292]:
In [351]:
dd.index
Out[351]:
In [362]:
dd_date_str = map(lambda x: str(x) +'-01-01', dd.index)
dd_date_str
Out[362]:
In [363]:
dd_date = pd.to_datetime(dd_date_str)
dd_date
Out[363]:
In [365]:
plt.plot(dd_date, dd.year, 'r-o')
plt.show()
In [384]:
ds = dd.cumsum()
ds
Out[384]:
In [397]:
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
ds = dd.cumsum()
def getDate(dat):
dat_date_str = map(lambda x: str(x) +'-01-01', dat.index)
dat_date = pd.to_datetime(dat_date_str)
return dat_date
ds.date = getDate(ds)
dd.date = getDate(dd)
plt.plot(ds_date, ds.year, 'g-s', label = '$Cumulative\: Number\:of\: Threads$')
plt.plot(dd_date, dd.year, 'r-o', label = '$Yearly\:Number\:of\:Threads$')
plt.legend(loc=2,numpoints=1,fontsize=13)
plt.show()
In [398]:
dg = data.groupby('year').sum()
dg
Out[398]:
In [400]:
dgs = dg.cumsum()
dgs
Out[400]:
In [414]:
def getDate(dat):
dat_date_str = map(lambda x: str(x) +'-01-01', dat.index)
dat_date = pd.to_datetime(dat_date_str)
return dat_date
dg.date = getDate(dg)
In [413]:
fig = plt.figure(figsize=(12,5))
plt.plot(dg.date, dg.click, 'r-o', label = '$Yearly\:Number\:of\:Clicks$')
plt.plot(dg.date, dg.reply, 'g-s', label = '$Yearly\:Number\:of\:Replies$')
plt.plot(dg.date, dg.fans_num, 'b->', label = '$Yearly\:Number\:of\:Fans$')
plt.yscale('log')
plt.legend(loc=4,numpoints=1,fontsize=13)
plt.show()
In [276]:
data.groupby('year')['click'].sum()
Out[276]:
In [275]:
data.groupby('year')['click'].mean()
Out[275]:
In [35]:
repost = []
for i in df.title:
if u'转载' in i.decode('utf8'):
repost.append(1)
else:
repost.append(0)
In [39]:
df['repost'] = repost
In [42]:
df.groupby('repost').sum()
Out[42]:
In [48]:
df['click'][df['repost']==0][:5]
Out[48]:
In [49]:
df['click'][df['repost']==1][:5]
Out[49]:
In [323]:
from scipy import stats
stats.ttest_ind(data.click, data.repost)
Out[323]:
In [324]:
sm.stats.ttest_ind(data.click, data.reply)
# test statistic, pvalue and degrees of freedom
Out[324]:
https://en.wikipedia.org/wiki/Chi-squared_test
Suppose there is a city of 1 million residents with four neighborhoods: A, B, C, and D.
A random sample of 650 residents of the city is taken and their occupation is recorded as "blue collar", "white collar", or "no collar".
The null hypothesis is that each person's neighborhood of residence is independent of the person's occupational classification. The data are tabulated as:
A | B | C | D | Total | |
---|---|---|---|---|---|
White collar | 90 | 60 | 104 | 95 | 349 |
Blue collar | 30 | 50 | 51 | 20 | 151 |
No coloar | 30 | 40 | 45 | 35 | 150 |
Total | 150 | 150 | 200 | 150 | 650 |
$ \frac{150}{650} \frac{349}{650} 650 = 80.54 $
Then in that "cell" of the table, we have
$\frac{(\text{observed}-\text{expected})^2}{\text{expected}} = \frac{(90-80.54)^2}{80.54}$.
The sum of these quantities over all of the cells is the test statistic.
Under the null hypothesis, it has approximately a chi-square distribution whose number of degrees of freedom are
$ (\text{number of rows}-1)(\text{number of columns}-1) = (3-1)(4-1) = 6. $
If the test statistic is improbably large according to that chi-square distribution, then one rejects the null hypothesis of independence.
Parameters:
In [428]:
from scipy.stats import chisquare
chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8])
Out[428]:
In [427]:
from scipy.stats import chisqprob, chi2
# p_value = chi2.sf(chi_statistic, df)
print chisqprob(3.94,1), 1 - chi2.cdf(3.94,1)
In [168]:
print np.corrcoef(data.click, data.reply)
In [383]:
data.corr()
Out[383]:
In [13]:
plt.plot(df.click, df.reply, 'r-o')
plt.show()
In [16]:
plt.plot(df.click, df.reply, 'gs')
plt.xlabel('$Clicks$', fontsize = 20)
plt.ylabel('$Replies$', fontsize = 20)
plt.xscale('log')
plt.yscale('log')
plt.title('$Allowmetric\,Law$', fontsize = 20)
plt.show()
In [19]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [20]:
# Load data
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()
有些使用windows的同学无法运行上述代码:
https://groups.google.com/forum/#!topic/pystatsmodels/KcSzNqDxv-Q
In [21]:
# Inspect the results
print results.summary()
In [137]:
reg = smf.ols('reply ~ click + followed_num', data=data).fit()
In [138]:
print reg.summary()
In [207]:
reg1 = smf.ols('np.log(reply+1) ~ np.log(click+1) +np.log(followed_num+1)+month', data=data).fit()
print reg1.summary()
In [209]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(reg1, fig = fig)
plt.show()
In [429]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
moore = sm.datasets.get_rdataset("Moore", "car",
cache=True) # load data
data = moore.data
data = data.rename(columns={"partner.status" :
"partner_status"}) # make name pythonic
In [434]:
data[:5]
Out[434]:
In [430]:
moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
data=data).fit()
In [432]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(moore_lm, fig = fig)
plt.show()
In [431]:
table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 ANOVA DataFrame
print table