In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics as smg
import pandas as pd
from pandas.plotting import parallel_coordinates, scatter_matrix
from scipy import stats
In [2]:
F, p = stats.f_oneway([1,2,3],[5,6,7])
print(F, p)
F, p = stats.f_oneway([25.6636147577, 26.8147042254, 26.5087485812, 26.0110693572, 26.1982930499, 25.0162178218, 25.4738536463, 25.7626961169, 26.2413388405, 26.6684925808],
[26.9368238908, 26.7905458624, 26.0659696128, 25.8725323008, 26.67954654, 26.9751683032, 26.0701459549, 26.1627538932, 26.9750950622, 25.6773437008],
[27.0405788466, 26.7461246306, 28.1998587517, 26.180994282, 27.4699458762, 26.2699015397, 26.1013955748, 27.9366444862, 26.1518355511, 26.4466845405])
print(F, p)
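# As a sanity check, the F statistic above can be recomputed from its definition
# (between-group mean square over within-group mean square); a minimal sketch,
# the helper name oneway_f is only illustrative:
def oneway_f(*groups):
    groups = [np.asarray(g, dtype=float) for g in groups]
    k = len(groups)                        # number of groups
    n = sum(len(g) for g in groups)        # total number of observations
    grand_mean = np.concatenate(groups).mean()
    ss_between = sum(len(g) * (g.mean() - grand_mean)**2 for g in groups)
    ss_within = sum(((g - g.mean())**2).sum() for g in groups)
    return (ss_between / (k - 1)) / (ss_within / (n - k))
print(oneway_f([1,2,3], [5,6,7]))  # should match the F returned by stats.f_oneway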
In [3]:
# cars dataset http://lib.stat.cmu.edu/DASL/Datafiles/Cars.html
data = pd.read_csv('donnees/cars.txt')
data.info()
In [4]:
# Pearson linear correlation coefficient
pearsonCorr = data.select_dtypes('number').corr()  # restrict to the numeric columns
names = pearsonCorr.index.tolist()
pearsonCorr
Out[4]:
In [5]:
plt.matshow(pearsonCorr)  # matshow opens its own figure
plt.clim(-1.,1.)
plt.colorbar()
plt.xticks(range(len(names)), names, rotation=90)
plt.yticks(range(len(names)), names)
Out[5]:
In [6]:
spearmanCorr = data.select_dtypes('number').corr(method='spearman')
names = spearmanCorr.index.tolist()
spearmanCorr
Out[6]:
In [7]:
plt.matshow(spearmanCorr)  # matshow opens its own figure
plt.clim(-1.,1.)
plt.colorbar()
plt.xticks(range(len(names)), names, rotation=90)
plt.yticks(range(len(names)), names)
Out[7]:
In [8]:
pearsonCorr-spearmanCorr
Out[8]:
In [9]:
plt.matshow(pearsonCorr - spearmanCorr)  # matshow opens its own figure
plt.colorbar()
plt.xticks(range(len(names)), names, rotation=90)
plt.yticks(range(len(names)), names)
Out[9]:
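# A small illustration, unrelated to the cars data, of why the two matrices can
# differ: Spearman only measures monotonicity while Pearson measures linearity.
x = np.arange(1, 11, dtype=float)
y = x**3                        # monotonic but strongly non-linear
print(stats.pearsonr(x, y))     # noticeably below 1
print(stats.spearmanr(x, y))    # exactly 1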
In [10]:
scatter_matrix(data[names], diagonal='kde')  # scatter_matrix builds its own grid of axes
plt.figure()
plt.scatter(data.Weight, data.Horsepower)
plt.xlabel('Weight')
plt.ylabel('Horsepower')
Out[10]:
In [11]:
model = smf.ols('MPG ~ Horsepower', data = data)
results = model.fit()
print(results.summary())
plt.figure()
smg.regressionplots.plot_fit(results,1)
plt.figure()
#smg.regressionplots.plot_regress_exog(results, 'Horsepower')
plt.figure()
plt.hist(results.resid)
plt.title('normality test: {}'.format(stats.normaltest(results.resid)))  #, stats.jarque_bera(results.resid)
plt.figure()
plt.scatter(results.fittedvalues, results.resid)
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.grid()
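# A minimal sketch of how the fitted simple regression can be reused;
# the horsepower values below are only illustrative inputs.
print(results.params)       # intercept and slope
print(results.rsquared)     # coefficient of determination
print(results.conf_int())   # 95% confidence intervals on the coefficients
print(results.predict(pd.DataFrame({'Horsepower': [100, 150]})))  # predicted MPG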
In [12]:
model = smf.ols('MPG ~ Country', data = data)
results = model.fit()
print(results.summary())
dataByCountry = []
for name, grouped in data.groupby('Country'):
    dataByCountry.append(grouped['MPG'].tolist())
F, p = stats.f_oneway(*dataByCountry)
print(data[['Country','MPG']].groupby('Country').describe())
print('ANOVA results:', F, p)
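# The overall F-test of the regression on the single categorical factor is the
# same test as the one-way ANOVA above, so both results should agree:
print(results.fvalue, results.f_pvalue)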
In [13]:
# example of creating the corresponding dummy (binary) variables by hand
for c in data.Country.unique():
    data[c] = (data['Country'] == c)
data['US'] = data['U.S.']  # 'U.S.' contains dots, which the formula parser cannot handle as a name
model = smf.ols('MPG ~ Germany + Italy + Japan + Sweden + US', data = data)
results = model.fit()
print(results.summary())
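# The same dummy coding can be obtained directly from the formula interface
# with C(), which avoids building the indicator columns by hand:
results_c = smf.ols('MPG ~ C(Country)', data=data).fit()
print(results_c.summary())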
In [14]:
model = smf.ols('MPG ~ Weight + Drive_Ratio + Horsepower + Displacement + Cylinders', data = data) # data = data[1:]
#C(Cylinders)
results = model.fit()
print(results.summary())
plt.figure()
smg.regressionplots.plot_fit(results,1)
plt.figure()
#smg.regressionplots.plot_regress_exog(results, 'Horsepower')
plt.figure()
plt.hist(results.resid)
plt.title('normality test: {}'.format(stats.normaltest(results.resid)))  #, stats.jarque_bera(results.resid)
plt.figure()
plt.scatter(results.fittedvalues, results.resid)
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.grid()
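# Given the strong correlations seen earlier between Weight, Displacement and
# Cylinders, a quick multicollinearity check with variance inflation factors
# (a minimal sketch; values well above 10 are usually considered problematic,
# and the first entry corresponds to the intercept column):
from statsmodels.stats.outliers_influence import variance_inflation_factor
exog = results.model.exog
for i, name in enumerate(results.model.exog_names):
    print(name, variance_inflation_factor(exog, i))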
In [15]:
model = smf.ols('MPG ~ Weight + Drive_Ratio', data = data)
#C(Cylinders)
results = model.fit()
print(results.summary())
plt.figure()
plt.scatter(results.fittedvalues, results.resid)
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.grid()
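# A minimal sketch comparing this reduced model with the full model of the
# previous cell through a partial F-test and AIC (lower AIC is better):
full = smf.ols('MPG ~ Weight + Drive_Ratio + Horsepower + Displacement + Cylinders',
               data=data).fit()
reduced = smf.ols('MPG ~ Weight + Drive_Ratio', data=data).fit()
print(full.compare_f_test(reduced))  # (F statistic, p-value, df difference)
print(full.aic, reduced.aic)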
In [16]:
# cars dataset: binary indicator for above-median fuel economy
data['MPG0'] = data['MPG']>data['MPG'].median()
#data.describe()
data[['Country','Car','MPG0']].describe()
Out[16]:
In [17]:
model = sm.Logit(data['MPG0'], data[['Weight', 'Drive_Ratio', 'Horsepower', 'Displacement', 'Cylinders']])
results = model.fit()
print(results.summary())
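# Note that sm.Logit, unlike the formula interface, does not add an intercept
# automatically (sm.add_constant would do that). A minimal sketch of turning
# the fitted model into predictions at the 0.5 threshold:
probs = results.predict()             # estimated P(MPG0 = 1) for each car
pred = probs > 0.5
print((pred == data['MPG0']).mean())  # in-sample accuracy
print(results.pred_table())           # confusion matrix (rows = observed)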
In [18]:
model = sm.MNLogit(data['Country'], data[['Weight', 'Horsepower', 'Displacement', 'Cylinders']])
results = model.fit()
print(results.summary())
In [21]:
# autos dataset
autos = pd.read_csv('donnees/autos.txt', delimiter='\t')
autos.describe()
autos.info()
autos.symboling.unique()
#autos['fuel-type'].unique()
Out[21]:
In [22]:
model = sm.MNLogit(autos['symboling'], autos[['price', 'length', 'width', 'height', 'engine-size']])
results = model.fit()
print(results.summary())
In [23]:
# Travel Mode Choice http://www.statsmodels.org/stable/datasets/generated/modechoice.html
# mode =
# 1 - air
# 2 - train
# 3 - bus
# 4 - car
modechoices = sm.datasets.modechoice.load_pandas()
#print(modechoices.data.info())
#modechoices.data.describe()
data = modechoices.data[modechoices.data['choice'] == 1.].copy()
#print(data.info(), data['mode'])
#parallel_coordinates(data, 'mode')
#res=scatter_matrix(data[['mode', 'ttme', 'invc', 'invt', 'gc', 'hinc', 'psize']], diagonal='kde')
In [24]:
print(data.corr())
data['car'] = (data['mode'] == 4)
model = sm.Logit(data['car'], data[['invc', 'hinc', 'psize']])
results = model.fit()
print(results.summary())
In [25]:
model = sm.MNLogit(modechoices.endog, modechoices.exog)
results = model.fit()
print(results.summary())
In [26]:
# others? a binary logit model (probability of walking) and a multinomial one (mode choice / accident severity)