In [1]:
import pip
try:
    from sklearn.datasets import fetch_california_housing
    from sklearn.datasets import load_boston
except ImportError:
    # install scikit-learn into the user site-packages if it is missing
    # (note: pip.main only works with pip versions older than 10)
    pip.main(['install', '--user', 'scikit-learn'])
The code below does not work in Azure: the dataset download fails with a 403 error.
boston = load_boston()
california = fetch_california_housing()
On a local machine, work around this instead: download the boston and california datasets, save them as CSV files, and then upload them to Azure.
import pandas as pd

# turn the Boston data into a DataFrame and export it as CSV for the upload
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
dataset.to_csv('boston.csv')
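The same export works for the California data; a minimal sketch (the file name california.csv is just an illustrative choice and does not appear in the original notebook):

california_df = pd.DataFrame(california.data, columns=california.feature_names)
california_df['target'] = california.target
california_df.to_csv('california.csv')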
In [2]:
from azureml import Workspace
ws = Workspace(
workspace_id='3c64d445b4c840dca9683dd47522eba3',
authorization_token='JaC5E2q5FouX14JhvCmcvmzagqV63q0oVIbu2jblLBdQ5e5wf/Y24Ed6uXLvbSUgbiao5iF85C3uufYKQgXoNw==',
endpoint='https://studioapi.azureml.net'
)
ds = ws.datasets['boston.csv']
df = ds.to_dataframe()
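The upload itself can also be scripted rather than done through the Studio UI. The sketch below assumes the classic azureml package exposes a datasets.add_from_dataframe helper and a DataTypeIds enum; neither call appears in the original notebook, so treat the exact names as an assumption.

from azureml import DataTypeIds  # assumed to be available in the classic azureml package

# register the locally built DataFrame as a new Studio dataset (names are illustrative)
ws.datasets.add_from_dataframe(
    dataframe=dataset,
    data_type_id=DataTypeIds.GenericCSV,
    name='boston.csv',
    description='Boston housing data exported locally'
)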
In [3]:
df.head()
Out[3]:
In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
In [5]:
%matplotlib inline
# If you are using IPython, this will make the images available in the Notebook
The following code plots a few normal (Gaussian) distributions.
In [6]:
import matplotlib.mlab as mlab

x = np.linspace(-4, 4, 100)
# mlab.normpdf(x, mu, sigma) interprets its third argument as the standard
# deviation; the function was removed from Matplotlib in version 3.1
for mean, variance in [(0, 0.7), (1, 1.5), (-2, 0.5)]:
    plt.plot(x, mlab.normpdf(x, mean, variance))
plt.show()
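Since mlab.normpdf is no longer available in current Matplotlib releases, an equivalent plot can be made with scipy.stats.norm; this is a substitute not used in the original notebook:

from scipy.stats import norm

x = np.linspace(-4, 4, 100)
for mean, sigma in [(0, 0.7), (1, 1.5), (-2, 0.5)]:
    plt.plot(x, norm.pdf(x, loc=mean, scale=sigma))  # scale = standard deviation
plt.show()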
In [7]:
y=mlab.normpdf(x,0,1)
In [8]:
type(y)
Out[8]:
Two ways of computing the mean. SSE is the sum of squared errors. Reading the histogram below: the first bar shows that roughly 350 observations have a squared error between 0 and 100.
In [9]:
print(df['target'].mean())
print(np.mean(df['target']))
mean_expected_value=np.mean(df['target'])
In [10]:
df.loc[:, 'target'].mean()  # .ix was removed from pandas; .loc is the label-based equivalent
Out[10]:
In [11]:
Square_errors=pd.Series(mean_expected_value-df['target'])**2
SSE=np.sum(Square_errors)
print('Sum of Squared Errors (SSE): %f'%SSE)
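Since the errors are deviations from the mean, the SSE is simply n times the (population) variance of the target; a quick check, not part of the original notebook:

# SSE around the mean equals n * biased variance of the target
n = len(df['target'])
print(np.isclose(SSE, n * np.var(df['target'])))  # expected: True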
In [12]:
density_plot = Square_errors.plot(kind='hist')
Standardization: the standardized values that are returned have mean 0 and variance 1.
In [13]:
def standardize(x):
    # rescale to zero mean and unit standard deviation
    return (x - np.mean(x)) / np.std(x)
In [15]:
standardize_target=standardize(df['target'])
In [16]:
standardize_target.std()
Out[16]:
In [17]:
standardize_target.mean()
Out[17]:
This function computes the covariance.
In [21]:
def covariance(variable_1, variable_2, bias=0):
    # bias=0 divides by n (population estimate); bias=1 divides by n-1
    observations = float(len(variable_1))
    return np.sum((variable_1 - np.mean(variable_1)) * (variable_2 - np.mean(variable_2))) / (observations - min(bias, 1))
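A quick sanity check, not part of the original notebook, is to compare the result against NumPy's own estimator:

# with bias=0 the function divides by n, which matches np.cov(..., bias=True)
print(covariance(df['RM'], df['target']))
print(np.cov(df['RM'], df['target'], bias=True)[0, 1])  # should match the line above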
This function computes the correlation; the only difference is that the inputs are standardized first.
In [22]:
def correlation(var1, var2, bias=0):
    # the covariance of standardized variables is the Pearson correlation
    return covariance(standardize(var1), standardize(var2), bias)
In [20]:
from scipy.stats.stats import pearsonr
print ('Our correlation estimation: %0.5f' % (correlation(df['RM'], df['target'])))
print ('Correlation from Scipy pearsonr estimation: %0.5f' % pearsonr(df['RM'], df['target'])[0])
In [23]:
print(pearsonr(df['RM'],df['target']))
Let's graph what happens when we correlate two variables. A scatterplot lets us easily visualize the two variables involved: it is a graph where the values of the two variables are treated as Cartesian coordinates, so every (x, y) pair is drawn as a point on the graph.
In [28]:
x_range = [df['RM'].min(),df['RM'].max()]
y_range = [df['target'].min(),df['target'].max()]
scatter_plot = df.plot(kind='scatter', x='RM', y='target',xlim=x_range, ylim=y_range)
meanY = scatter_plot.plot(x_range, [df['target'].mean(),df['target'].mean()], '--' , color='red', linewidth=1)
meanX = scatter_plot.plot([df['RM'].mean(),df['RM'].mean()], y_range, '--', color='red', linewidth=1)
The scatterplot also plots the average value of both the target and the predictor variable as dashed lines. This divides the plot into four quadrants. If we compare it with the previous covariance and correlation formulas, we can understand why the correlation value was close to 1: in the bottom-right and top-left quadrants there are only a few mismatching points, where one of the variables is above its average while the other is below its own.

A perfect match (a correlation of 1 or -1) is possible only when the points lie on a straight line (so that all points are concentrated in the upper-right and lower-left quadrants). Correlation is therefore a measure of linear association, of how close your points are to a straight line. Ideally, having all your points on a single line favors a perfect mapping of your predictor variable to your target.
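To make the quadrant argument concrete, we can count how many points fall into matching versus mismatching quadrants (a small check, not part of the original notebook):

# count observations in the quadrants defined by the two means
above_x = df['RM'] > df['RM'].mean()
above_y = df['target'] > df['target'].mean()
matching = (above_x == above_y).sum()     # upper-right and lower-left quadrants
mismatching = (above_x != above_y).sum()  # upper-left and lower-right quadrants
print(matching, mismatching)  # matching points clearly outnumber mismatching ones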