In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
import pandas as pd
http://statsmodels.sourceforge.net/
Statsmodels is a Python module that allows users to explore data, estimate statistical models, and perform statistical tests.
An extensive list of descriptive statistics, statistical tests, plotting functions, and result statistics are available for different types of data and each estimator.
Researchers across fields may find that statsmodels fully meets their needs for statistical computing and data analysis in Python.
In [2]:
import pandas as pd
train = pd.read_csv('../data/tatanic_train.csv',\
sep = ",", header=0)
test = pd.read_csv('../data/tatanic_test.csv',\
sep = ",", header=0)
You can easily explore a DataFrame
In [5]:
train.head()
Out[5]:
In [6]:
train.describe()
Out[6]:
In [8]:
train.shape#, len(train)
#train.columns
Out[8]:
In [11]:
# Passengers that survived vs passengers that passed away
train["Survived"][:3]
Out[11]:
In [10]:
# Passengers that survived vs passengers that passed away
train["Survived"].value_counts()
Out[10]:
In [9]:
# As proportions
train["Survived"].value_counts(normalize = True)
Out[9]:
In [10]:
train['Sex'].value_counts()
Out[10]:
In [12]:
train[train['Sex']=='female'][:3]#[train['Pclass'] == 3]
Out[12]:
In [13]:
# Males that survived vs males that passed away
train[["Survived", 'Fare']][train["Sex"] == 'male'][:3]
Out[13]:
In [15]:
# Males that survived vs males that passed away
train["Survived"][train["Sex"] == 'male'].value_counts()
Out[15]:
In [31]:
# Females that survived vs Females that passed away
train["Survived"][train["Sex"] == 'female'].value_counts()
Out[31]:
In [32]:
# Normalized male survival
train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True)
Out[32]:
In [33]:
# Normalized female survival
train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)
Out[33]:
In [97]:
# Create the column Child, and indicate whether child or not a child. Print the new column.
train["Child"] = float('NaN')
train.Child[train.Age < 5] = 1
train.Child[train.Age >= 5] = 0
print(train.Child[:3])
In [12]:
# Normalized Survival Rates for under 18
train.Survived[train.Child == 1].value_counts(normalize = True)
Out[12]:
In [24]:
# Normalized Survival Rates for over 18
train.Survived[train.Child == 0].value_counts(normalize = True)
Out[24]:
In [4]:
age = pd.cut(train['Age'], [0, 18, 80])
train.pivot_table('Survived', ['Sex', age], 'Pclass')
Out[4]:
In [5]:
fare = pd.qcut(train['Fare'], 2)
train.pivot_table('Survived', ['Sex', age], [fare, 'Pclass'])
Out[5]:
In [21]:
# Create a copy of test: test_one
test_one = test
# Initialize a Survived column to 0
test_one['Survived'] = 0
# Set Survived to 1 if Sex equals "female" and print the `Survived` column from `test_one`
test_one.Survived[test_one.Sex =='female'] = 1
print(test_one.Survived[:3])
In [26]:
#Convert the male and female groups to integer form
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
#Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna('S')
#Convert the Embarked classes to integer form
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
In [98]:
df = pd.read_csv('../data/tianya_bbs_threads_list.txt',\
sep = "\t", names = ['title','link', \
'author','author_page',\
'click','reply','time'])
df[:2]
Out[98]:
In [15]:
# df=df.rename(columns = {0:'title', 1:'link', \
# 2:'author',3:'author_page',\
# 4:'click', 5:'reply', 6:'time'})
# df[:5]
In [99]:
da = pd.read_csv('../data/tianya_bbs_threads_author_info.txt',
sep = "\t", names = ['author_page','followed_num',\
'fans_num','post_num', \
'comment_num'])
da[:2]
Out[99]:
In [19]:
# da=da.rename(columns = {0:'author_page', 1:'followed_num',\
# 2:'fans_num',3:'post_num', \
# 4:'comment_num'})
# # da[:5]
In [101]:
data = pd.concat([df,da], axis=1)
len(data)
Out[101]:
In [102]:
data[:3]
Out[102]:
In [103]:
type(data.time[0])
Out[103]:
In [104]:
# extract date from datetime
# date = map(lambda x: x[:10], data.time)
date = [i[:10] for i in data.time]
#date = [i[:10] for i in data.time]
data['date'] = pd.to_datetime(date)
In [107]:
data.date[:3]
Out[107]:
In [108]:
# convert str to datetime format
data.time = pd.to_datetime(data.time)
data['month'] = data.time.dt.month
data['year'] = data.time.dt.year
data['day'] = data.time.dt.day
type(data.time[0])
Out[108]:
In [109]:
data.describe()
#data.head()
Out[109]:
http://statsmodels.sourceforge.net/
Statsmodels is a Python module that allows users to explore data, estimate statistical models, and perform statistical tests.
An extensive list of descriptive statistics, statistical tests, plotting functions, and result statistics are available for different types of data and each estimator.
Researchers across fields may find that statsmodels fully meets their needs for statistical computing and data analysis in Python.
In [35]:
import statsmodels.api as sm
In [38]:
data.describe()
Out[38]:
In [39]:
import numpy as np
np.mean(data.click), np.std(data.click), np.sum(data.click)
Out[39]:
In [40]:
# 不加权的变量描述
d1 = sm.stats.DescrStatsW(data.click, \
weights=[1 for i in data.click])
d1.mean, d1.var, d1.std, d1.sum
Out[40]:
In [41]:
# 加权的变量描述
d1 = sm.stats.DescrStatsW(data.click, weights=data.reply)
d1.mean, d1.var, d1.std, d1.sum
Out[41]:
In [163]:
np.median(data.click) # np.percentile
Out[163]:
In [42]:
plt.hist(data.click)
plt.show()
In [43]:
plt.hist(data.reply, color = 'green')
plt.show()
In [112]:
plt.hist(np.log(data.click+1), color='green')
plt.hist(np.log(data.reply+1), color='red')
plt.show()
In [115]:
# Plot the height and weight to see
plt.boxplot([np.log(data.click+1)])
plt.show()
In [47]:
# Plot the height and weight to see
plt.boxplot([data.click, data.reply])
plt.show()
In [116]:
def transformData(dat):
results = []
for i in dat:
if i != 'na':
results.append( int(i))
else:
results.append(0)
return results
In [117]:
data.fans_num = transformData(data.fans_num)
data.followed_num = transformData(data.followed_num )
data.post_num = transformData(data.post_num )
data.comment_num = transformData(data.comment_num )
data.describe()
Out[117]:
In [50]:
import numpy as np
# Plot the height and weight to see
plt.boxplot([np.log(data.click+1), np.log(data.reply+1),
np.log(data.fans_num+1),\
np.log(data.followed_num + 1)],
labels = ['$Click$', '$Reply$', '$Fans$',\
'$Followed$'])
plt.show()
In [118]:
fig = plt.figure(figsize=(12,4))
data.boxplot(return_type='dict')
plt.yscale('log')
plt.show()
In [52]:
from pandas.tools import plotting
# fig = plt.figure(figsize=(10, 10))
plotting.scatter_matrix(data[['click', 'reply',\
'post_num','comment_num']])
plt.show()
http://pandas.pydata.org/pandas-docs/version/0.15.0/visualization.html
In [53]:
import seaborn # conda install seaborn
seaborn.pairplot(data, vars=['click', 'reply', \
'post_num', 'comment_num'],
kind='reg')
Out[53]:
In [54]:
seaborn.pairplot(data, vars=['click', 'reply', 'post_num'],
kind='reg', hue='year')
Out[54]:
In [126]:
seaborn.lmplot(y='reply', x='click', data=data, #logx = True,
size = 5)
plt.show()
In [56]:
data.year.value_counts()
Out[56]:
In [127]:
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
dd
Out[127]:
In [128]:
dd.index
Out[128]:
In [129]:
dd_date_str = list(map(lambda x: str(x) +'-01-01', dd.index))
dd_date_str
Out[129]:
In [130]:
dd_date = pd.to_datetime(list(dd_date_str))
dd_date
Out[130]:
In [131]:
plt.plot(dd_date, dd.year, 'r-o')
plt.show()
In [132]:
ds = dd.cumsum()
ds
Out[132]:
In [133]:
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
ds = dd.cumsum()
def getDate(dat):
dat_date_str = list(map(lambda x: str(x) +'-01-01', dat.index))
dat_date = pd.to_datetime(dat_date_str)
return dat_date
ds.date = getDate(ds)
dd.date = getDate(dd)
plt.plot(ds.date, ds.year, 'g-s', label = '$Cumulative\: Number\:of\: Threads$')
plt.plot(dd.date, dd.year, 'r-o', label = '$Yearly\:Number\:of\:Threads$')
plt.legend(loc=2,numpoints=1,fontsize=13)
plt.show()
In [137]:
dg = data.groupby('year').sum()
dg
Out[137]:
In [138]:
dgs = dg.cumsum()
dgs
Out[138]:
In [139]:
def getDate(dat):
dat_date_str = list(map(lambda x: str(x) +'-01-01', dat.index))
dat_date = pd.to_datetime(dat_date_str)
return dat_date
dg.date = getDate(dg)
In [140]:
fig = plt.figure(figsize=(12,5))
plt.plot(dg.date, dg.click, 'r-o', label = '$Yearly\:Number\:of\:Clicks$')
plt.plot(dg.date, dg.reply, 'g-s', label = '$Yearly\:Number\:of\:Replies$')
plt.plot(dg.date, dg.fans_num, 'b->', label = '$Yearly\:Number\:of\:Fans$')
plt.yscale('log')
plt.legend(loc=4,numpoints=1,fontsize=13)
plt.show()
In [141]:
data.groupby('year')['click'].sum()
Out[141]:
In [142]:
data.groupby('year')['click'].mean()
Out[142]:
In [143]:
repost = []
for i in df.title:
if u'转载' in i:
repost.append(1)
else:
repost.append(0)
In [145]:
df['repost'] = repost
In [146]:
df.groupby('repost').median()
Out[146]:
In [147]:
df['click'][df['repost']==0][:5]
Out[147]:
In [148]:
df['click'][df['repost']==1][:5]
Out[148]:
In [152]:
from scipy import stats
stats.ttest_ind(np.log(df.click+1), df.repost)
Out[152]:
In [154]:
sm.stats.ttest_ind(np.log(df.click+1), df.repost)
# test statistic, pvalue and degrees of freedom
Out[154]:
https://en.wikipedia.org/wiki/Chi-squared_test
reject the null hypothesis that the data are independent
.assumption of independent normally distributed data
, which is valid in many cases due to the central limit theorem
. sum of squared errors
, or through the sample variance
.Suppose there is a city of 1 million residents with four neighborhoods
: A, B, C, and D.
A random sample of 650 residents of the city is taken and their occupation is recorded as "blue collar", "white collar", or "no collar".
The null hypothesis
is that each person's neighborhood of residence is independent of the person's occupational classification. The data are tabulated as:
A | B | C | D | Total | |
---|---|---|---|---|---|
White collar | 90 | 60 | 104 | 95 | 349 |
Blue collar | 30 | 50 | 51 | 20 | 151 |
No coloar | 30 | 40 | 45 | 35 | 150 |
Total | 150 | 150 | 200 | 150 | 650 |
white-collar workers in neighborhood A
to be$ \frac{150}{650} \frac{349}{650} 650 = 80.54 $
Then in that "cell" of the table, we have
$\frac{(\text{observed}-\text{expected})^2}{\text{expected}} = \frac{(90-80.54)^2}{80.54}$.
The sum of these quantities over all of the cells is the test statistic
.
Under the null hypothesis, it has approximately a chi-square distribution whose number of degrees of freedom
are
$ (\text{number of rows}-1)(\text{number of columns}-1) = (3-1)(4-1) = 6. $
If the test statistic is improbably large according to that chi-square distribution, then one rejects the null hypothesis of independence.
Parameters:
In [428]:
from scipy.stats import chisquare
chisquare([16, 18, 16, 14, 12, 12], \
f_exp=[16, 16, 16, 16, 16, 8])
Out[428]:
In [155]:
from scipy.stats import chi2
# p_value = chi2.sf(chi_statistic, df)
print(chi2.sf(3.5, 5))
print(1 - chi2.cdf(3.5,5))
In [157]:
# np.corrcoef(data.click, data.reply)
np.corrcoef(np.log(data.click+1), \
np.log(data.reply+1))
Out[157]:
In [383]:
data.corr()
Out[383]:
In [13]:
plt.plot(df.click, df.reply, 'r-o')
plt.show()
In [16]:
plt.plot(df.click, df.reply, 'gs')
plt.xlabel('$Clicks$', fontsize = 20)
plt.ylabel('$Replies$', fontsize = 20)
plt.xscale('log')
plt.yscale('log')
plt.title('$Allowmetric\,Law$', fontsize = 20)
plt.show()
In [66]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [67]:
# Load data
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', \
data=dat).fit()
有些使用windows的同学无法运行上述代码:
https://groups.google.com/forum/#!topic/pystatsmodels/KcSzNqDxv-Q
In [21]:
# Inspect the results
print results.summary()
In [68]:
reg = smf.ols('reply ~ click + followed_num', \
data=data).fit()
In [138]:
reg.summary()
In [60]:
reg1 = smf.ols('np.log(reply+1) ~ np.log(click+1) \
+np.log(followed_num+1)+month', data=data).fit()
print reg1.summary()
In [209]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(reg1, fig = fig)
plt.show()
In [429]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
moore = sm.datasets.get_rdataset("Moore", "car",
cache=True) # load data
data = moore.data
data = data.rename(columns={"partner.status" :
"partner_status"}) # make name pythonic
In [434]:
data[:5]
Out[434]: