In [2]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import scipy

%matplotlib inline

In [3]:
boston_dataset = datasets.load_boston()
X_full = boston_dataset.data
Y = boston_dataset.target

In [4]:
selector = SelectKBest(f_regression, k=1)
selector.fit(X_full,Y)
k = scipy.array([False, False, False, False, False, False, False, False, False, False,False,False,True], dtype=bool)
X = X_full[:,k] #,selector.get_support()]

In [5]:
plt.scatter(X,Y,color='black')
plt.show()



In [6]:
regressor = LinearRegression(normalize=True)
regressor.fit(X,Y)
plt.scatter(X,Y,color='black')
plt.plot(X, regressor.predict(X),color='blue',linewidth=2)
plt.show()



In [7]:
regressor = SVR()
regressor.fit(X,Y)
plt.scatter(X,Y,color='black')
plt.scatter(X, regressor.predict(X),color='blue',linewidth=1)
plt.show()



In [8]:
regressor = RandomForestRegressor()
regressor.fit(X,Y)
plt.scatter(X,Y,color='black')
plt.scatter(X, regressor.predict(X),color='blue',linewidth=1)
plt.show()



In [9]:
X_full[0]


Out[9]:
array([  6.32000000e-03,   1.80000000e+01,   2.31000000e+00,
         0.00000000e+00,   5.38000000e-01,   6.57500000e+00,
         6.52000000e+01,   4.09000000e+00,   1.00000000e+00,
         2.96000000e+02,   1.53000000e+01,   3.96900000e+02,
         4.98000000e+00])

In [10]:
print (boston_dataset.DESCR)


Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
http://archive.ics.uci.edu/ml/datasets/Housing


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
**References**

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
   - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)


In [11]:
selector.get_support()


Out[11]:
array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True], dtype=bool)

In [12]:
X_full[0]


Out[12]:
array([  6.32000000e-03,   1.80000000e+01,   2.31000000e+00,
         0.00000000e+00,   5.38000000e-01,   6.57500000e+00,
         6.52000000e+01,   4.09000000e+00,   1.00000000e+00,
         2.96000000e+02,   1.53000000e+01,   3.96900000e+02,
         4.98000000e+00])

In [13]:
X_full[1]


Out[13]:
array([  2.73100000e-02,   0.00000000e+00,   7.07000000e+00,
         0.00000000e+00,   4.69000000e-01,   6.42100000e+00,
         7.89000000e+01,   4.96710000e+00,   2.00000000e+00,
         2.42000000e+02,   1.78000000e+01,   3.96900000e+02,
         9.14000000e+00])

In [14]:
Y[0]


Out[14]:
24.0

In [15]:
Y[1]


Out[15]:
21.600000000000001

In [16]:
from sklearn import datasets
iris = datasets.load_iris()
import pandas as pd
import numpy as np
colors = list()
palette = {0:"red", 1:"green", 2:"blue"}
for c in np.nditer(iris.target):
    colors.append(palette[int(c)])
dataframe = pd.DataFrame(iris.data,columns=iris.feature_names)
scatterplot=pd.scatter_matrix(dataframe,alpha=0.3,figsize=(10,10),
                              diagonal='hist',color=colors,marker='o',grid=True)



In [17]:
from sklearn.datasets import fetch_mldata
import pandas as pd
earthquakes = fetch_mldata('global-earthquakes')
print (earthquakes.data)
print (earthquakes.data.shape)
dataframe = pd.DataFrame(earthquakes.data)
dataframe.describe()


---------------------------------------------------------------------------
ConnectionRefusedError                    Traceback (most recent call last)
/usr/lib/python3.4/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1181             try:
-> 1182                 h.request(req.get_method(), req.selector, req.data, headers)
   1183             except OSError as err: # timeout error

/usr/lib/python3.4/http/client.py in request(self, method, url, body, headers)
   1087         """Send a complete request to the server."""
-> 1088         self._send_request(method, url, body, headers)
   1089 

/usr/lib/python3.4/http/client.py in _send_request(self, method, url, body, headers)
   1125             body = body.encode('iso-8859-1')
-> 1126         self.endheaders(body)
   1127 

/usr/lib/python3.4/http/client.py in endheaders(self, message_body)
   1083             raise CannotSendHeader()
-> 1084         self._send_output(message_body)
   1085 

/usr/lib/python3.4/http/client.py in _send_output(self, message_body)
    921             message_body = None
--> 922         self.send(msg)
    923         if message_body is not None:

/usr/lib/python3.4/http/client.py in send(self, data)
    856             if self.auto_open:
--> 857                 self.connect()
    858             else:

/usr/lib/python3.4/http/client.py in connect(self)
    833         self.sock = self._create_connection((self.host,self.port),
--> 834                                             self.timeout, self.source_address)
    835 

/usr/lib/python3.4/socket.py in create_connection(address, timeout, source_address)
    511     if err is not None:
--> 512         raise err
    513     else:

/usr/lib/python3.4/socket.py in create_connection(address, timeout, source_address)
    502                 sock.bind(source_address)
--> 503             sock.connect(sa)
    504             return sock

ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-17-519adf3a77a8> in <module>()
      1 from sklearn.datasets import fetch_mldata
      2 import pandas as pd
----> 3 earthquakes = fetch_mldata('global-earthquakes')
      4 print (earthquakes.data)
      5 print (earthquakes.data.shape)

/usr/local/lib/python3.4/dist-packages/sklearn/datasets/mldata.py in fetch_mldata(dataname, target_name, data_name, transpose_data, data_home)
    140         urlname = MLDATA_BASE_URL % quote(dataname)
    141         try:
--> 142             mldata_url = urlopen(urlname)
    143         except HTTPError as e:
    144             if e.code == 404:

/usr/lib/python3.4/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    159     else:
    160         opener = _opener
--> 161     return opener.open(url, data, timeout)
    162 
    163 def install_opener(opener):

/usr/lib/python3.4/urllib/request.py in open(self, fullurl, data, timeout)
    461             req = meth(req)
    462 
--> 463         response = self._open(req, data)
    464 
    465         # post-process response

/usr/lib/python3.4/urllib/request.py in _open(self, req, data)
    479         protocol = req.type
    480         result = self._call_chain(self.handle_open, protocol, protocol +
--> 481                                   '_open', req)
    482         if result:
    483             return result

/usr/lib/python3.4/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    439         for handler in handlers:
    440             func = getattr(handler, meth_name)
--> 441             result = func(*args)
    442             if result is not None:
    443                 return result

/usr/lib/python3.4/urllib/request.py in http_open(self, req)
   1208 
   1209     def http_open(self, req):
-> 1210         return self.do_open(http.client.HTTPConnection, req)
   1211 
   1212     http_request = AbstractHTTPHandler.do_request_

/usr/lib/python3.4/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1182                 h.request(req.get_method(), req.selector, req.data, headers)
   1183             except OSError as err: # timeout error
-> 1184                 raise URLError(err)
   1185             r = h.getresponse()
   1186         except:

URLError: <urlopen error [Errno 111] Connection refused>

In [ ]:
iris_filename='data/datasets-uci-iris.csv'
iris = pd.read_csv(iris_filename,sep=',',decimal='.',header=None,
                   names=['sepal_length','sepal_width','petal_length','petal_width','target'])
print (type(iris))
iris_data = iris.values[:,:4]
iris_target,iris_target_labels=pd.factorize(iris.target)
print (iris_data.shape, iris_target.shape)
iris.describe()

In [ ]:
from sklearn import datasets
X,y = datasets.make_classification(n_samples=10**6,n_features=10,random_state=101)
print (X.shape, y.shape)
X[0]
y

In [ ]:
X[0]

In [ ]:
from sklearn.datasets import make_moons, make_circles, make_classification
moon = make_moons(noise=0.3, random_state=0)
circle = make_circles(noise=0.2, factor=0.5, random_state=1)
#linear = linearly_separable

print (moon[0].shape)

In [ ]: