notebook.community

Edit and run



In [10]:

    
from sklearn.linear_model import LinearRegression # Linear Regressioon Model for the Regression Implementation
import pandas as pd # Importing the pandas libbrary for the Data Analysis tools
import numpy as np
dataframe = pd.read_csv("winequality-red.csv")
dataframe.describe() # Outputs the Dataframe in a way









    Out[10]:







  
    
      
      fixed acidity
      volatile acidity
      citric acid
      residual sugar
      chlorides
      free sulfur dioxide
      total sulfur dioxide
      density
      pH
      sulphates
      alcohol
      quality
    
  
  
    
      count
      1599.000000
      1599.000000
      1599.000000
      1599.000000
      1599.000000
      1599.000000
      1599.000000
      1599.000000
      1599.000000
      1599.000000
      1599.000000
      1599.000000
    
    
      mean
      8.319637
      0.527821
      0.270976
      2.538806
      0.087467
      15.874922
      46.467792
      0.996747
      3.311113
      0.658149
      10.422983
      5.636023
    
    
      std
      1.741096
      0.179060
      0.194801
      1.409928
      0.047065
      10.460157
      32.895324
      0.001887
      0.154386
      0.169507
      1.065668
      0.807569
    
    
      min
      4.600000
      0.120000
      0.000000
      0.900000
      0.012000
      1.000000
      6.000000
      0.990070
      2.740000
      0.330000
      8.400000
      3.000000
    
    
      25%
      7.100000
      0.390000
      0.090000
      1.900000
      0.070000
      7.000000
      22.000000
      0.995600
      3.210000
      0.550000
      9.500000
      5.000000
    
    
      50%
      7.900000
      0.520000
      0.260000
      2.200000
      0.079000
      14.000000
      38.000000
      0.996750
      3.310000
      0.620000
      10.200000
      6.000000
    
    
      75%
      9.200000
      0.640000
      0.420000
      2.600000
      0.090000
      21.000000
      62.000000
      0.997835
      3.400000
      0.730000
      11.100000
      6.000000
    
    
      max
      15.900000
      1.580000
      1.000000
      15.500000
      0.611000
      72.000000
      289.000000
      1.003690
      4.010000
      2.000000
      14.900000
      8.000000



In [11]:

    
# Creating the Quality and the Alchol relation Matplotlib Graphc
from matplotlib import pyplot as plt
plt1 = plt
plt1.scatter(dataframe["alcohol"],dataframe["quality"])
plt1.xlabel("Alcohol")
plt1.ylabel("Quality")
plt1.title("Alchol Against Quality")
plt1.show()



In [12]:

    
plt2 = plt
plt2.scatter(dataframe["volatile acidity"],dataframe["quality"])
plt2.xlabel("Volatile Acidity")
plt2.ylabel("Quality")
plt2.title("Volatile Acidity Against Quality")
plt2.show()
# This reveals the negative relation ship between the Volatile Acidity and the Quality



In [30]:

    
corr_matrix = dataframe.corr().abs() # gets the absolute correlation betwee the data frames and otehrs.
# os = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
#                  .stack()
#                  .order(ascending=False))
#print(corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)).stack())
# os = sorted(corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)).stack())
# print(os[-1])
# # print(max(dataframe.corr())) # This gives use the maximum correlation value
# print(min(dataframe.corr()))
print(dataframe.corr().head(1))









    



               fixed acidity  volatile acidity  citric acid  residual sugar  \
fixed acidity            1.0         -0.256131     0.671703        0.114777   

               chlorides  free sulfur dioxide  total sulfur dioxide   density  \
fixed acidity   0.093705            -0.153794             -0.113181  0.668047   

                     pH  sulphates   alcohol   quality  
fixed acidity -0.682978   0.183006 -0.061668  0.124052

Fitting and evaluating the model



In [97]:

    
from sklearn.linear_model import LinearRegression # FOR LR
#from sklearn.cross_validation import train_test_split # Splits the data in the form of train and sets
# There is another method for the Selection of the train and the test splitting ! 
from sklearn.model_selection import train_test_split # This is for the training sets and samples .! 
from sklearn.model_selection import cross_val_score # For cross valdation ! 
plot2  = plt # Having the plot
X = dataframe[list(dataframe.columns)[:-1]] # All dataframes except the last one ( which is the quality)
Y = dataframe['quality']
X_train,X_test,Y_train,Y_test = train_test_split(X,Y) # This function gets the Training and the testin Samples Itself, 
# The Common division with 20 % of test data and the rest 80% of the training data .
regressor = LinearRegression() # This is the Linear Regression Model
# plot2.plot(X_test,Y_test, label  = "Training data")
regressor.fit(X_train,Y_train) # Fitting the training data in the linear regression model
y_prediction = regressor.predict(X_test) # predicting the test data on teh basis of the provided testing data frame\
# The date frame is extracted from the csv file and splited basedon the splot data set . 
# The prediction will be used to compare the results with the data set given and then applied on the data
print(len(X_train),len(Y_train),len(X_test),len(Y_test))
# plot2.plot(X_train,Y_train,label = "Training Data")
# plot2.axis([0,20,0,20])
print ("R-Squared :",regressor.score(X_test,Y_test))
# plot2.show()
# SELF CREATED PREDICTION CHECK HOW MUCH WAS ACCURATE
#print(len(y_prediction),len(Y_train))

true_prediction = 0
for item,score in enumerate(y_prediction):
    # print(Y[item],round(Y[item]) == round(score) , score) # Add this to see the output
    if round(Y[item]) == round(score):
        true_prediction += 1
        print(Y[item],round(Y[item]) == round(score) , score)
print("true prediction : ",true_prediction)
print("Total Prediction :",len(Y)) # WORST PREDICTION BY THE ALGORITHM.









    



(1199, 1199, 400, 400)
('R-Squared :', 0.34496033007429583)
(400, 1199)






    












    



(5, True, 5.0599718721448426)
(6, True, 6.2657837604144895)
(5, True, 4.9727038268356587)
(5, True, 5.323512082677059)
(5, True, 5.3074819916226481)
(5, True, 5.2609894899444605)
(6, True, 6.0301594609802649)
(5, True, 5.286003480953827)
(6, True, 5.7254868431949397)
(5, True, 5.4598429890577229)
(6, True, 5.6645248506765178)
(6, True, 6.1854085776684826)
(5, True, 5.42502078698136)
(5, True, 5.2176516881745627)
(5, True, 4.9329340837462823)
(5, True, 5.3107250023877022)
(6, True, 5.6215925674320992)
(5, True, 5.2353299890560265)
(5, True, 5.0355395378374084)
(5, True, 5.3550144709336251)
(5, True, 5.0468328243943077)
(5, True, 5.1080685768624612)
(5, True, 5.32630950193888)
(5, True, 5.4590375072909723)
(5, True, 5.1619325512759318)
(5, True, 5.0825980576651908)
(5, True, 5.1812923091798879)
(5, True, 5.3267211308754554)
(6, True, 5.5026506845131209)
(5, True, 5.2388267257511565)
(6, True, 5.6178825335112315)
(6, True, 6.1434245557639855)
(6, True, 6.2503930376456012)
(5, True, 4.8269513337564049)
(5, True, 5.2723780686748967)
(6, True, 5.647350485476931)
(6, True, 6.1292555059980884)
(6, True, 5.7709940270675659)
(5, True, 5.0950491475425679)
(6, True, 5.5479263055751318)
(5, True, 5.4785274650286766)
(5, True, 5.3444784437015755)
(5, True, 5.3736861733570009)
(5, True, 4.8951522912136909)
(5, True, 5.1383030256284172)
(5, True, 5.3954268702608488)
(6, True, 6.1734079749153956)
(5, True, 4.9004372809111985)
(5, True, 5.3413237324993048)
(5, True, 5.152517831901438)
(5, True, 4.7297055537919235)
(6, True, 6.2353657409396135)
(5, True, 5.1827914299600337)
(6, True, 5.6614230221727198)
(5, True, 4.89435697687518)
(5, True, 5.2140147809828363)
(5, True, 5.2554965288092745)
(5, True, 5.3716394397210117)
(6, True, 5.6404329191767104)
(5, True, 5.2256068337936519)
(5, True, 4.6769436894587386)
(5, True, 4.9699368816117655)
(6, True, 6.2248230473092931)
(6, True, 5.6647410171763468)
(5, True, 5.4761486847470486)
(6, True, 5.5484743672280104)
(5, True, 5.3361848199461406)
(5, True, 5.254873601870365)
(5, True, 5.4980618814113278)
(5, True, 5.3976370072494717)
(5, True, 5.2344916033168936)
(5, True, 5.2609894899444605)
(6, True, 5.708767814134692)
(5, True, 4.9403910679819134)
(5, True, 5.0460061254176409)
(5, True, 5.0875098653999338)
(5, True, 5.4084478284848494)
(5, True, 5.2597849564836352)
(6, True, 5.5927759362784784)
(5, True, 5.1296089513457162)
(5, True, 5.3477413837906873)
(6, True, 5.8722355891982652)
(5, True, 5.1546843409189869)
(5, True, 5.0079315138860139)
(5, True, 5.3495030409595117)
(6, True, 5.8612251925426158)
(6, True, 6.3593045942870212)
(6, True, 6.3515812620292369)
(6, True, 6.4098739344811193)
(6, True, 5.7709940270675659)
(6, True, 5.9201363469081514)
(6, True, 6.1635017375901349)
(6, True, 5.7695168866679474)
(6, True, 5.5697369286492453)
(6, True, 5.6244200392643862)
(5, True, 5.1839840605191512)
(6, True, 5.8841215438332384)
(5, True, 5.4653614314705621)
(5, True, 5.237020926603968)
(5, True, 5.2843894253558545)
(6, True, 5.7266999783817383)
(6, True, 5.5078531281560821)
(6, True, 6.0809266953529715)
(6, True, 6.4446669686402593)
(6, True, 6.0756020101497015)
(5, True, 5.2921616021845779)
(5, True, 5.2904790776191035)
(6, True, 6.2605717068516853)
(6, True, 5.8658484631973664)
(5, True, 5.1418100590160769)
(5, True, 5.2808524395161776)
(6, True, 5.6239858965883691)
(5, True, 5.3452060720307912)
(5, True, 5.1296089513457162)
(6, True, 5.5196803911502244)
(6, True, 5.9214819052290188)
(6, True, 5.6000836563089642)
(6, True, 6.2355570735822141)
(6, True, 5.7571299256191821)
(6, True, 5.7658606682003608)
(5, True, 5.4391804145206919)
(6, True, 6.421816518267832)
(6, True, 5.6620304075269416)
(6, True, 5.6620304075269416)
(6, True, 5.556870799621791)
(5, True, 5.4160264807997827)
(6, True, 5.8383511841476796)
(6, True, 5.9758356695122856)
(6, True, 5.9764202310923942)
(5, True, 5.3292614713003132)
(6, True, 5.8694516195000475)
(6, True, 5.7665317965041609)
(6, True, 6.4354887170289743)
(6, True, 5.8394038377338333)
(6, True, 5.5078531281560821)
(6, True, 6.1316209013076843)
(5, True, 5.3494465985430963)
(6, True, 6.1168269060658886)
(5, True, 4.8582088116719291)
(5, True, 5.3382886717440208)
(5, True, 5.2172317057877748)
(7, True, 6.5475858796213959)
(6, True, 6.0349019593394608)
(5, True, 4.9056274275064915)
(6, True, 5.7548461752125277)
(5, True, 5.0436791389253983)
(6, True, 5.7719240420492106)
(6, True, 6.39914593959824)
(6, True, 6.1817544868769332)
(6, True, 5.8693855125450618)
(6, True, 5.6993243188764886)
(6, True, 5.8109572597893333)
(6, True, 6.2425294107566955)
(6, True, 5.7243110626532641)
(5, True, 5.2461628840025227)
(6, True, 5.9812402055911953)
(5, True, 5.0631817359449514)
('true prediction : ', 157)
('Total Prediction :', 1599)



In [98]:

    
plot2.scatter(Y_test,y_prediction) # Showing the ploted Accuracy for the model. as per predicted
# its better for predicting the wines ( thats a strange hing )
plot2.show()

CROSS VALIDATION SCORE PRODUCES BETTER ESTIMATES



In [89]:

    
X = dataframe[list(dataframe.columns)[:-1]] # All dataframes except the last one ( which is the quality)
Y = dataframe['quality']
regressor = LinearRegression() # model
scores = cross_val_score(regressor, X,Y, cv =5) # A model , Data exploratory data:X , data Response :Y
print(scores) # This generates some random number of Crossvalidation 
# Random sets of cross validation : you can choose the best one 
print(scores.mean()) # Printing the mean for the scores obtained from cross validation !
# print(scores.view)
#print(scores.key())
#regressor.predict()









    



[ 0.13200871  0.31858135  0.34955348  0.369145    0.2809196 ]
0.290041628842
<built-in method view of numpy.ndarray object at 0x1a1592ee90>

APPLYING GRADIENT DESCENT : BATCH , STOCHASTIC



In [3]:

    
import numpy as np # Numpy 
from sklearn.linear_model import LinearRegression # Linear Regression Model.
from sklearn.model_selection import cross_val_score # CROSS VALIDATION OF MODEL
from sklearn.linear_model import SGDRegressor # Gradient Descent : Stochastic :  updates using a single instance
from sklearn.datasets import load_boston # Already buitin data set for the loading of the boston 
from sklearn.preprocessing import StandardScaler # FOR FITTING AND TRANSFORMATION OF THE DATA PREPROCESSED . 
from sklearn.model_selection import train_test_split # For automatically having a data set in 4 lists [XT,XTRAIN,Y,YTRAIN]
#from sklearn.cross_validation import train_test_split
data = load_boston()
# Having the Train test spliting 
#print(data.target)
X_train,X_test, Y_train, Y_test = train_test_split(data.data,data.target)# data.data are exploratory variable, data.target is Y one



In [17]:

    
# #print(Y_train)
# # Having the scaled Features for the StandardScaler configuration  !
# X_Scaler = StandardScaler() # 
# Y_Scaler= StandardScaler() # Will be used to conver to scaler using the fit_transform stuff
# X_Train = X_Scaler.fit_transform(X_train) # Fitting the trainng data nd then transforming using the standard scaler
# print(X_Train) # DEBUGGIN
# print(np.array([Y_train])) #Debugging
# Y_Train = Y_Scaler.fit_transform(np.array([Y_train])) # Fitting and transforming the response data
# X_Test = X_Scaler.transform(X_test) # Transform the test using the X_test

# #Y_Test = Y_Scaler.transform(np.array([Y_test])) # Transforms the test using the Y_test



In [18]:

    
# regressor = SGDRegressor(loss = 'squared_loss')
# score = cross_val_score(regressor,  X_train , Y_train, cv = 5) # Having the SScore cross validation for the training data set
# print('Cross Validation r-squared Score: ',score)
# print("Average cross validation ",np.mean(score))
# regressor.fit_transform(X_train,Y_train) # Fiting and then transforming!
# print("Test Score : ",regressor.score(X_test,Y_test))



In [19]:

    
import sklearn
sklearn.__version__









    Out[19]:





'0.19.0'



In [2]:

    
from sklearn.linear_model import SGDClassifier # Importing the classifiers
X = [[0., 0.], [1., 1.]] # Sample Exploratory edata set
Y = [0,1 ] # Sample Training Responses which would be matched ! 
clf = SGDClassifier(loss = "hinge", penalty = "l2") # loss --> hinge : soft margin LSVM, log --> logestic Regression , 
clf.fit(X,Y) # Classifier being used to fit the data
print(clf)
# Takes all the things bydefault  , the penalty is L2 , 
# GRADIENT DESCENT ON LINEAR REGRESSION BASED CLASSIFIER
clf.predict([[2,2]]) # PREDICTS THE INPUT DATASET X --> Y 
print(clf.coef_) # prints the coefficent for the Clf. # it has the model parameters  # Currently  two parameters









    



/Users/shafay.amjad/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.py:84: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)






    



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
[[ 9.91080278  9.91080278]]



In [3]:

    
# For checking the intercepts of the CLassifier
print(clf.intercept_) # prints he intercept aka offset or bias  # This can be controlled using the fit_intercept  # Parameter









    



[-9.97004991]



In [5]:

    
# To get the signed distance fro mthe hyhper ploace to the specific point , its like : 
clf.decision_function([[2,2]]) # This is for the Linear modeling using a single variable >  ! ! ! ! !









    Out[5]:





array([ 29.67316119])



In [8]:

    
#HAVING THE LOG AS LOSS
clf = SGDClassifier(loss = "log").fit(X,Y) # Fitting the module using the logestic Loss function regession on the gradient
clf.predict_proba([[1., 1.]]) # Gives a probability vector estimates on the Classifier Showing the result of matching.
# THe first one will produce almost zero , wile the second one would be 0.99 as its the matching element with the trained









    Out[8]:





array([[  4.97248476e-07,   9.99999503e-01]])



In [ ]:

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality
count	1599.000000	1599.000000	1599.000000	1599.000000	1599.000000	1599.000000	1599.000000	1599.000000	1599.000000	1599.000000	1599.000000	1599.000000
mean	8.319637	0.527821	0.270976	2.538806	0.087467	15.874922	46.467792	0.996747	3.311113	0.658149	10.422983	5.636023
std	1.741096	0.179060	0.194801	1.409928	0.047065	10.460157	32.895324	0.001887	0.154386	0.169507	1.065668	0.807569
min	4.600000	0.120000	0.000000	0.900000	0.012000	1.000000	6.000000	0.990070	2.740000	0.330000	8.400000	3.000000
25%	7.100000	0.390000	0.090000	1.900000	0.070000	7.000000	22.000000	0.995600	3.210000	0.550000	9.500000	5.000000
50%	7.900000	0.520000	0.260000	2.200000	0.079000	14.000000	38.000000	0.996750	3.310000	0.620000	10.200000	6.000000
75%	9.200000	0.640000	0.420000	2.600000	0.090000	21.000000	62.000000	0.997835	3.400000	0.730000	11.100000	6.000000
max	15.900000	1.580000	1.000000	15.500000	0.611000	72.000000	289.000000	1.003690	4.010000	2.000000	14.900000	8.000000