In [10]:
from sklearn.linear_model import LinearRegression # Linear Regressioon Model for the Regression Implementation
import pandas as pd # Importing the pandas libbrary for the Data Analysis tools
import numpy as np
dataframe = pd.read_csv("winequality-red.csv")
dataframe.describe() # Outputs the Dataframe in a way


Out[10]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000
mean 8.319637 0.527821 0.270976 2.538806 0.087467 15.874922 46.467792 0.996747 3.311113 0.658149 10.422983 5.636023
std 1.741096 0.179060 0.194801 1.409928 0.047065 10.460157 32.895324 0.001887 0.154386 0.169507 1.065668 0.807569
min 4.600000 0.120000 0.000000 0.900000 0.012000 1.000000 6.000000 0.990070 2.740000 0.330000 8.400000 3.000000
25% 7.100000 0.390000 0.090000 1.900000 0.070000 7.000000 22.000000 0.995600 3.210000 0.550000 9.500000 5.000000
50% 7.900000 0.520000 0.260000 2.200000 0.079000 14.000000 38.000000 0.996750 3.310000 0.620000 10.200000 6.000000
75% 9.200000 0.640000 0.420000 2.600000 0.090000 21.000000 62.000000 0.997835 3.400000 0.730000 11.100000 6.000000
max 15.900000 1.580000 1.000000 15.500000 0.611000 72.000000 289.000000 1.003690 4.010000 2.000000 14.900000 8.000000

In [11]:
# Creating the Quality and the Alchol relation Matplotlib Graphc
from matplotlib import pyplot as plt
plt1 = plt
plt1.scatter(dataframe["alcohol"],dataframe["quality"])
plt1.xlabel("Alcohol")
plt1.ylabel("Quality")
plt1.title("Alchol Against Quality")
plt1.show()



In [12]:
plt2 = plt
plt2.scatter(dataframe["volatile acidity"],dataframe["quality"])
plt2.xlabel("Volatile Acidity")
plt2.ylabel("Quality")
plt2.title("Volatile Acidity Against Quality")
plt2.show()
# This reveals the negative relation ship between the Volatile Acidity and the Quality



In [30]:
corr_matrix = dataframe.corr().abs() # gets the absolute correlation betwee the data frames and otehrs.
# os = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
#                  .stack()
#                  .order(ascending=False))
#print(corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)).stack())
# os = sorted(corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)).stack())
# print(os[-1])
# # print(max(dataframe.corr())) # This gives use the maximum correlation value
# print(min(dataframe.corr()))
print(dataframe.corr().head(1))


               fixed acidity  volatile acidity  citric acid  residual sugar  \
fixed acidity            1.0         -0.256131     0.671703        0.114777   

               chlorides  free sulfur dioxide  total sulfur dioxide   density  \
fixed acidity   0.093705            -0.153794             -0.113181  0.668047   

                     pH  sulphates   alcohol   quality  
fixed acidity -0.682978   0.183006 -0.061668  0.124052  

Fitting and evaluating the model


In [97]:
from sklearn.linear_model import LinearRegression # FOR LR
#from sklearn.cross_validation import train_test_split # Splits the data in the form of train and sets
# There is another method for the Selection of the train and the test splitting ! 
from sklearn.model_selection import train_test_split # This is for the training sets and samples .! 
from sklearn.model_selection import cross_val_score # For cross valdation ! 
plot2  = plt # Having the plot
X = dataframe[list(dataframe.columns)[:-1]] # All dataframes except the last one ( which is the quality)
Y = dataframe['quality']
X_train,X_test,Y_train,Y_test = train_test_split(X,Y) # This function gets the Training and the testin Samples Itself, 
# The Common division with 20 % of test data and the rest 80% of the training data .
regressor = LinearRegression() # This is the Linear Regression Model
# plot2.plot(X_test,Y_test, label  = "Training data")
regressor.fit(X_train,Y_train) # Fitting the training data in the linear regression model
y_prediction = regressor.predict(X_test) # predicting the test data on teh basis of the provided testing data frame\
# The date frame is extracted from the csv file and splited basedon the splot data set . 
# The prediction will be used to compare the results with the data set given and then applied on the data
print(len(X_train),len(Y_train),len(X_test),len(Y_test))
# plot2.plot(X_train,Y_train,label = "Training Data")
# plot2.axis([0,20,0,20])
print ("R-Squared :",regressor.score(X_test,Y_test))
# plot2.show()
# SELF CREATED PREDICTION CHECK HOW MUCH WAS ACCURATE
#print(len(y_prediction),len(Y_train))

true_prediction = 0
for item,score in enumerate(y_prediction):
    # print(Y[item],round(Y[item]) == round(score) , score) # Add this to see the output
    if round(Y[item]) == round(score):
        true_prediction += 1
        print(Y[item],round(Y[item]) == round(score) , score)
print("true prediction : ",true_prediction)
print("Total Prediction :",len(Y)) # WORST PREDICTION BY THE ALGORITHM.


(1199, 1199, 400, 400)
('R-Squared :', 0.34496033007429583)
(400, 1199)
(5, True, 5.0599718721448426)
(6, True, 6.2657837604144895)
(5, True, 4.9727038268356587)
(5, True, 5.323512082677059)
(5, True, 5.3074819916226481)
(5, True, 5.2609894899444605)
(6, True, 6.0301594609802649)
(5, True, 5.286003480953827)
(6, True, 5.7254868431949397)
(5, True, 5.4598429890577229)
(6, True, 5.6645248506765178)
(6, True, 6.1854085776684826)
(5, True, 5.42502078698136)
(5, True, 5.2176516881745627)
(5, True, 4.9329340837462823)
(5, True, 5.3107250023877022)
(6, True, 5.6215925674320992)
(5, True, 5.2353299890560265)
(5, True, 5.0355395378374084)
(5, True, 5.3550144709336251)
(5, True, 5.0468328243943077)
(5, True, 5.1080685768624612)
(5, True, 5.32630950193888)
(5, True, 5.4590375072909723)
(5, True, 5.1619325512759318)
(5, True, 5.0825980576651908)
(5, True, 5.1812923091798879)
(5, True, 5.3267211308754554)
(6, True, 5.5026506845131209)
(5, True, 5.2388267257511565)
(6, True, 5.6178825335112315)
(6, True, 6.1434245557639855)
(6, True, 6.2503930376456012)
(5, True, 4.8269513337564049)
(5, True, 5.2723780686748967)
(6, True, 5.647350485476931)
(6, True, 6.1292555059980884)
(6, True, 5.7709940270675659)
(5, True, 5.0950491475425679)
(6, True, 5.5479263055751318)
(5, True, 5.4785274650286766)
(5, True, 5.3444784437015755)
(5, True, 5.3736861733570009)
(5, True, 4.8951522912136909)
(5, True, 5.1383030256284172)
(5, True, 5.3954268702608488)
(6, True, 6.1734079749153956)
(5, True, 4.9004372809111985)
(5, True, 5.3413237324993048)
(5, True, 5.152517831901438)
(5, True, 4.7297055537919235)
(6, True, 6.2353657409396135)
(5, True, 5.1827914299600337)
(6, True, 5.6614230221727198)
(5, True, 4.89435697687518)
(5, True, 5.2140147809828363)
(5, True, 5.2554965288092745)
(5, True, 5.3716394397210117)
(6, True, 5.6404329191767104)
(5, True, 5.2256068337936519)
(5, True, 4.6769436894587386)
(5, True, 4.9699368816117655)
(6, True, 6.2248230473092931)
(6, True, 5.6647410171763468)
(5, True, 5.4761486847470486)
(6, True, 5.5484743672280104)
(5, True, 5.3361848199461406)
(5, True, 5.254873601870365)
(5, True, 5.4980618814113278)
(5, True, 5.3976370072494717)
(5, True, 5.2344916033168936)
(5, True, 5.2609894899444605)
(6, True, 5.708767814134692)
(5, True, 4.9403910679819134)
(5, True, 5.0460061254176409)
(5, True, 5.0875098653999338)
(5, True, 5.4084478284848494)
(5, True, 5.2597849564836352)
(6, True, 5.5927759362784784)
(5, True, 5.1296089513457162)
(5, True, 5.3477413837906873)
(6, True, 5.8722355891982652)
(5, True, 5.1546843409189869)
(5, True, 5.0079315138860139)
(5, True, 5.3495030409595117)
(6, True, 5.8612251925426158)
(6, True, 6.3593045942870212)
(6, True, 6.3515812620292369)
(6, True, 6.4098739344811193)
(6, True, 5.7709940270675659)
(6, True, 5.9201363469081514)
(6, True, 6.1635017375901349)
(6, True, 5.7695168866679474)
(6, True, 5.5697369286492453)
(6, True, 5.6244200392643862)
(5, True, 5.1839840605191512)
(6, True, 5.8841215438332384)
(5, True, 5.4653614314705621)
(5, True, 5.237020926603968)
(5, True, 5.2843894253558545)
(6, True, 5.7266999783817383)
(6, True, 5.5078531281560821)
(6, True, 6.0809266953529715)
(6, True, 6.4446669686402593)
(6, True, 6.0756020101497015)
(5, True, 5.2921616021845779)
(5, True, 5.2904790776191035)
(6, True, 6.2605717068516853)
(6, True, 5.8658484631973664)
(5, True, 5.1418100590160769)
(5, True, 5.2808524395161776)
(6, True, 5.6239858965883691)
(5, True, 5.3452060720307912)
(5, True, 5.1296089513457162)
(6, True, 5.5196803911502244)
(6, True, 5.9214819052290188)
(6, True, 5.6000836563089642)
(6, True, 6.2355570735822141)
(6, True, 5.7571299256191821)
(6, True, 5.7658606682003608)
(5, True, 5.4391804145206919)
(6, True, 6.421816518267832)
(6, True, 5.6620304075269416)
(6, True, 5.6620304075269416)
(6, True, 5.556870799621791)
(5, True, 5.4160264807997827)
(6, True, 5.8383511841476796)
(6, True, 5.9758356695122856)
(6, True, 5.9764202310923942)
(5, True, 5.3292614713003132)
(6, True, 5.8694516195000475)
(6, True, 5.7665317965041609)
(6, True, 6.4354887170289743)
(6, True, 5.8394038377338333)
(6, True, 5.5078531281560821)
(6, True, 6.1316209013076843)
(5, True, 5.3494465985430963)
(6, True, 6.1168269060658886)
(5, True, 4.8582088116719291)
(5, True, 5.3382886717440208)
(5, True, 5.2172317057877748)
(7, True, 6.5475858796213959)
(6, True, 6.0349019593394608)
(5, True, 4.9056274275064915)
(6, True, 5.7548461752125277)
(5, True, 5.0436791389253983)
(6, True, 5.7719240420492106)
(6, True, 6.39914593959824)
(6, True, 6.1817544868769332)
(6, True, 5.8693855125450618)
(6, True, 5.6993243188764886)
(6, True, 5.8109572597893333)
(6, True, 6.2425294107566955)
(6, True, 5.7243110626532641)
(5, True, 5.2461628840025227)
(6, True, 5.9812402055911953)
(5, True, 5.0631817359449514)
('true prediction : ', 157)
('Total Prediction :', 1599)

In [98]:
plot2.scatter(Y_test,y_prediction) # Showing the ploted Accuracy for the model. as per predicted
# its better for predicting the wines ( thats a strange hing )
plot2.show()


CROSS VALIDATION SCORE PRODUCES BETTER ESTIMATES


In [89]:
X = dataframe[list(dataframe.columns)[:-1]] # All dataframes except the last one ( which is the quality)
Y = dataframe['quality']
regressor = LinearRegression() # model
scores = cross_val_score(regressor, X,Y, cv =5) # A model , Data exploratory data:X , data Response :Y
print(scores) # This generates some random number of Crossvalidation 
# Random sets of cross validation : you can choose the best one 
print(scores.mean()) # Printing the mean for the scores obtained from cross validation !
# print(scores.view)
#print(scores.key())
#regressor.predict()


[ 0.13200871  0.31858135  0.34955348  0.369145    0.2809196 ]
0.290041628842
<built-in method view of numpy.ndarray object at 0x1a1592ee90>

APPLYING GRADIENT DESCENT : BATCH , STOCHASTIC


In [3]:
import numpy as np # Numpy 
from sklearn.linear_model import LinearRegression # Linear Regression Model.
from sklearn.model_selection import cross_val_score # CROSS VALIDATION OF MODEL
from sklearn.linear_model import SGDRegressor # Gradient Descent : Stochastic :  updates using a single instance
from sklearn.datasets import load_boston # Already buitin data set for the loading of the boston 
from sklearn.preprocessing import StandardScaler # FOR FITTING AND TRANSFORMATION OF THE DATA PREPROCESSED . 
from sklearn.model_selection import train_test_split # For automatically having a data set in 4 lists [XT,XTRAIN,Y,YTRAIN]
#from sklearn.cross_validation import train_test_split
data = load_boston()
# Having the Train test spliting 
#print(data.target)
X_train,X_test, Y_train, Y_test = train_test_split(data.data,data.target)# data.data are exploratory variable, data.target is Y one

In [17]:
# #print(Y_train)
# # Having the scaled Features for the StandardScaler configuration  !
# X_Scaler = StandardScaler() # 
# Y_Scaler= StandardScaler() # Will be used to conver to scaler using the fit_transform stuff
# X_Train = X_Scaler.fit_transform(X_train) # Fitting the trainng data nd then transforming using the standard scaler
# print(X_Train) # DEBUGGIN
# print(np.array([Y_train])) #Debugging
# Y_Train = Y_Scaler.fit_transform(np.array([Y_train])) # Fitting and transforming the response data
# X_Test = X_Scaler.transform(X_test) # Transform the test using the X_test

# #Y_Test = Y_Scaler.transform(np.array([Y_test])) # Transforms the test using the Y_test

In [18]:
# regressor = SGDRegressor(loss = 'squared_loss')
# score = cross_val_score(regressor,  X_train , Y_train, cv = 5) # Having the SScore cross validation for the training data set
# print('Cross Validation r-squared Score: ',score)
# print("Average cross validation ",np.mean(score))
# regressor.fit_transform(X_train,Y_train) # Fiting and then transforming!
# print("Test Score : ",regressor.score(X_test,Y_test))

In [19]:
import sklearn
sklearn.__version__


Out[19]:
'0.19.0'

In [2]:
from sklearn.linear_model import SGDClassifier # Importing the classifiers
X = [[0., 0.], [1., 1.]] # Sample Exploratory edata set
Y = [0,1 ] # Sample Training Responses which would be matched ! 
clf = SGDClassifier(loss = "hinge", penalty = "l2") # loss --> hinge : soft margin LSVM, log --> logestic Regression , 
clf.fit(X,Y) # Classifier being used to fit the data
print(clf)
# Takes all the things bydefault  , the penalty is L2 , 
# GRADIENT DESCENT ON LINEAR REGRESSION BASED CLASSIFIER
clf.predict([[2,2]]) # PREDICTS THE INPUT DATASET X --> Y 
print(clf.coef_) # prints the coefficent for the Clf. # it has the model parameters  # Currently  two parameters


/Users/shafay.amjad/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.py:84: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
[[ 9.91080278  9.91080278]]

In [3]:
# For checking the intercepts of the CLassifier
print(clf.intercept_) # prints he intercept aka offset or bias  # This can be controlled using the fit_intercept  # Parameter


[-9.97004991]

In [5]:
# To get the signed distance fro mthe hyhper ploace to the specific point , its like : 
clf.decision_function([[2,2]]) # This is for the Linear modeling using a single variable >  ! ! ! ! !


Out[5]:
array([ 29.67316119])

In [8]:
#HAVING THE LOG AS LOSS
clf = SGDClassifier(loss = "log").fit(X,Y) # Fitting the module using the logestic Loss function regession on the gradient
clf.predict_proba([[1., 1.]]) # Gives a probability vector estimates on the Classifier Showing the result of matching.
# THe first one will produce almost zero , wile the second one would be 0.99 as its the matching element with the trained


Out[8]:
array([[  4.97248476e-07,   9.99999503e-01]])

In [ ]: