In [1]:
import pandas as pd
iris_filename = 'data/datasets-uci-iris.csv'
iris = pd.read_csv(iris_filename, sep=',',decimal='.',header=None,
                   names=['sepal_length','sepal_width','petal_length','petal_width','target'])

In [2]:
iris.describe()


Out[2]:
sepal_length sepal_width petal_length petal_width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000

In [3]:
iris.fillna(1)


Out[3]:
sepal_length sepal_width petal_length petal_width target
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
5 5.4 3.9 1.7 0.4 Iris-setosa
6 4.6 3.4 1.4 0.3 Iris-setosa
7 5.0 3.4 1.5 0.2 Iris-setosa
8 4.4 2.9 1.4 0.2 Iris-setosa
9 4.9 3.1 1.5 0.1 Iris-setosa
10 5.4 3.7 1.5 0.2 Iris-setosa
11 4.8 3.4 1.6 0.2 Iris-setosa
12 4.8 3.0 1.4 0.1 Iris-setosa
13 4.3 3.0 1.1 0.1 Iris-setosa
14 5.8 4.0 1.2 0.2 Iris-setosa
15 5.7 4.4 1.5 0.4 Iris-setosa
16 5.4 3.9 1.3 0.4 Iris-setosa
17 5.1 3.5 1.4 0.3 Iris-setosa
18 5.7 3.8 1.7 0.3 Iris-setosa
19 5.1 3.8 1.5 0.3 Iris-setosa
20 5.4 3.4 1.7 0.2 Iris-setosa
21 5.1 3.7 1.5 0.4 Iris-setosa
22 4.6 3.6 1.0 0.2 Iris-setosa
23 5.1 3.3 1.7 0.5 Iris-setosa
24 4.8 3.4 1.9 0.2 Iris-setosa
25 5.0 3.0 1.6 0.2 Iris-setosa
26 5.0 3.4 1.6 0.4 Iris-setosa
27 5.2 3.5 1.5 0.2 Iris-setosa
28 5.2 3.4 1.4 0.2 Iris-setosa
29 4.7 3.2 1.6 0.2 Iris-setosa
... ... ... ... ... ...
120 6.9 3.2 5.7 2.3 Iris-virginica
121 5.6 2.8 4.9 2.0 Iris-virginica
122 7.7 2.8 6.7 2.0 Iris-virginica
123 6.3 2.7 4.9 1.8 Iris-virginica
124 6.7 3.3 5.7 2.1 Iris-virginica
125 7.2 3.2 6.0 1.8 Iris-virginica
126 6.2 2.8 4.8 1.8 Iris-virginica
127 6.1 3.0 4.9 1.8 Iris-virginica
128 6.4 2.8 5.6 2.1 Iris-virginica
129 7.2 3.0 5.8 1.6 Iris-virginica
130 7.4 2.8 6.1 1.9 Iris-virginica
131 7.9 3.8 6.4 2.0 Iris-virginica
132 6.4 2.8 5.6 2.2 Iris-virginica
133 6.3 2.8 5.1 1.5 Iris-virginica
134 6.1 2.6 5.6 1.4 Iris-virginica
135 7.7 3.0 6.1 2.3 Iris-virginica
136 6.3 3.4 5.6 2.4 Iris-virginica
137 6.4 3.1 5.5 1.8 Iris-virginica
138 6.0 3.0 4.8 1.8 Iris-virginica
139 6.9 3.1 5.4 2.1 Iris-virginica
140 6.7 3.1 5.6 2.4 Iris-virginica
141 6.9 3.1 5.1 2.3 Iris-virginica
142 5.8 2.7 5.1 1.9 Iris-virginica
143 6.8 3.2 5.9 2.3 Iris-virginica
144 6.7 3.3 5.7 2.5 Iris-virginica
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica

150 rows × 5 columns


In [4]:
bad_dataset = pd.read_csv('data/loading_example_1.csv',error_bad_lines=False)


Skipping line 4: expected 3 fields, saw 4


In [5]:
bad_dataset.fillna(-1)


Out[5]:
Val1 Val2 Val3
0 0 0 0
1 1 1 1
2 3 3 3
3 4 4 -1

In [6]:
bad_dataset.describe()


Out[6]:
Val1 Val2 Val3
count 4.000000 4.000000 3.000000
mean 2.000000 2.000000 1.333333
std 1.825742 1.825742 1.527525
min 0.000000 0.000000 0.000000
25% 0.750000 0.750000 0.500000
50% 2.000000 2.000000 1.000000
75% 3.250000 3.250000 2.000000
max 4.000000 4.000000 3.000000

In [7]:
bad_dataset


Out[7]:
Val1 Val2 Val3
0 0 0 0
1 1 1 1
2 3 3 3
3 4 4 NaN

In [8]:
iris_chunks = pd.read_csv(iris_filename,header=None,names=['c1','c2','c3','c4','c5'],chunksize=10)

In [9]:
iris_chunks


Out[9]:
<pandas.io.parsers.TextFileReader at 0x7f8438898210>

In [10]:
for chunk in iris_chunks:
    print chunk.shape
    print chunk


(10, 5)
    c1   c2   c3   c4           c5
0  5.1  3.5  1.4  0.2  Iris-setosa
1  4.9  3.0  1.4  0.2  Iris-setosa
2  4.7  3.2  1.3  0.2  Iris-setosa
3  4.6  3.1  1.5  0.2  Iris-setosa
4  5.0  3.6  1.4  0.2  Iris-setosa
5  5.4  3.9  1.7  0.4  Iris-setosa
6  4.6  3.4  1.4  0.3  Iris-setosa
7  5.0  3.4  1.5  0.2  Iris-setosa
8  4.4  2.9  1.4  0.2  Iris-setosa
9  4.9  3.1  1.5  0.1  Iris-setosa
(10, 5)
    c1   c2   c3   c4           c5
0  5.4  3.7  1.5  0.2  Iris-setosa
1  4.8  3.4  1.6  0.2  Iris-setosa
2  4.8  3.0  1.4  0.1  Iris-setosa
3  4.3  3.0  1.1  0.1  Iris-setosa
4  5.8  4.0  1.2  0.2  Iris-setosa
5  5.7  4.4  1.5  0.4  Iris-setosa
6  5.4  3.9  1.3  0.4  Iris-setosa
7  5.1  3.5  1.4  0.3  Iris-setosa
8  5.7  3.8  1.7  0.3  Iris-setosa
9  5.1  3.8  1.5  0.3  Iris-setosa
(10, 5)
    c1   c2   c3   c4           c5
0  5.4  3.4  1.7  0.2  Iris-setosa
1  5.1  3.7  1.5  0.4  Iris-setosa
2  4.6  3.6  1.0  0.2  Iris-setosa
3  5.1  3.3  1.7  0.5  Iris-setosa
4  4.8  3.4  1.9  0.2  Iris-setosa
5  5.0  3.0  1.6  0.2  Iris-setosa
6  5.0  3.4  1.6  0.4  Iris-setosa
7  5.2  3.5  1.5  0.2  Iris-setosa
8  5.2  3.4  1.4  0.2  Iris-setosa
9  4.7  3.2  1.6  0.2  Iris-setosa
(10, 5)
    c1   c2   c3   c4           c5
0  4.8  3.1  1.6  0.2  Iris-setosa
1  5.4  3.4  1.5  0.4  Iris-setosa
2  5.2  4.1  1.5  0.1  Iris-setosa
3  5.5  4.2  1.4  0.2  Iris-setosa
4  4.9  3.1  1.5  0.1  Iris-setosa
5  5.0  3.2  1.2  0.2  Iris-setosa
6  5.5  3.5  1.3  0.2  Iris-setosa
7  4.9  3.1  1.5  0.1  Iris-setosa
8  4.4  3.0  1.3  0.2  Iris-setosa
9  5.1  3.4  1.5  0.2  Iris-setosa
(10, 5)
    c1   c2   c3   c4           c5
0  5.0  3.5  1.3  0.3  Iris-setosa
1  4.5  2.3  1.3  0.3  Iris-setosa
2  4.4  3.2  1.3  0.2  Iris-setosa
3  5.0  3.5  1.6  0.6  Iris-setosa
4  5.1  3.8  1.9  0.4  Iris-setosa
5  4.8  3.0  1.4  0.3  Iris-setosa
6  5.1  3.8  1.6  0.2  Iris-setosa
7  4.6  3.2  1.4  0.2  Iris-setosa
8  5.3  3.7  1.5  0.2  Iris-setosa
9  5.0  3.3  1.4  0.2  Iris-setosa
(10, 5)
    c1   c2   c3   c4               c5
0  7.0  3.2  4.7  1.4  Iris-versicolor
1  6.4  3.2  4.5  1.5  Iris-versicolor
2  6.9  3.1  4.9  1.5  Iris-versicolor
3  5.5  2.3  4.0  1.3  Iris-versicolor
4  6.5  2.8  4.6  1.5  Iris-versicolor
5  5.7  2.8  4.5  1.3  Iris-versicolor
6  6.3  3.3  4.7  1.6  Iris-versicolor
7  4.9  2.4  3.3  1.0  Iris-versicolor
8  6.6  2.9  4.6  1.3  Iris-versicolor
9  5.2  2.7  3.9  1.4  Iris-versicolor
(10, 5)
    c1   c2   c3   c4               c5
0  5.0  2.0  3.5  1.0  Iris-versicolor
1  5.9  3.0  4.2  1.5  Iris-versicolor
2  6.0  2.2  4.0  1.0  Iris-versicolor
3  6.1  2.9  4.7  1.4  Iris-versicolor
4  5.6  2.9  3.6  1.3  Iris-versicolor
5  6.7  3.1  4.4  1.4  Iris-versicolor
6  5.6  3.0  4.5  1.5  Iris-versicolor
7  5.8  2.7  4.1  1.0  Iris-versicolor
8  6.2  2.2  4.5  1.5  Iris-versicolor
9  5.6  2.5  3.9  1.1  Iris-versicolor
(10, 5)
    c1   c2   c3   c4               c5
0  5.9  3.2  4.8  1.8  Iris-versicolor
1  6.1  2.8  4.0  1.3  Iris-versicolor
2  6.3  2.5  4.9  1.5  Iris-versicolor
3  6.1  2.8  4.7  1.2  Iris-versicolor
4  6.4  2.9  4.3  1.3  Iris-versicolor
5  6.6  3.0  4.4  1.4  Iris-versicolor
6  6.8  2.8  4.8  1.4  Iris-versicolor
7  6.7  3.0  5.0  1.7  Iris-versicolor
8  6.0  2.9  4.5  1.5  Iris-versicolor
9  5.7  2.6  3.5  1.0  Iris-versicolor
(10, 5)
    c1   c2   c3   c4               c5
0  5.5  2.4  3.8  1.1  Iris-versicolor
1  5.5  2.4  3.7  1.0  Iris-versicolor
2  5.8  2.7  3.9  1.2  Iris-versicolor
3  6.0  2.7  5.1  1.6  Iris-versicolor
4  5.4  3.0  4.5  1.5  Iris-versicolor
5  6.0  3.4  4.5  1.6  Iris-versicolor
6  6.7  3.1  4.7  1.5  Iris-versicolor
7  6.3  2.3  4.4  1.3  Iris-versicolor
8  5.6  3.0  4.1  1.3  Iris-versicolor
9  5.5  2.5  4.0  1.3  Iris-versicolor
(10, 5)
    c1   c2   c3   c4               c5
0  5.5  2.6  4.4  1.2  Iris-versicolor
1  6.1  3.0  4.6  1.4  Iris-versicolor
2  5.8  2.6  4.0  1.2  Iris-versicolor
3  5.0  2.3  3.3  1.0  Iris-versicolor
4  5.6  2.7  4.2  1.3  Iris-versicolor
5  5.7  3.0  4.2  1.2  Iris-versicolor
6  5.7  2.9  4.2  1.3  Iris-versicolor
7  6.2  2.9  4.3  1.3  Iris-versicolor
8  5.1  2.5  3.0  1.1  Iris-versicolor
9  5.7  2.8  4.1  1.3  Iris-versicolor
(10, 5)
    c1   c2   c3   c4              c5
0  6.3  3.3  6.0  2.5  Iris-virginica
1  5.8  2.7  5.1  1.9  Iris-virginica
2  7.1  3.0  5.9  2.1  Iris-virginica
3  6.3  2.9  5.6  1.8  Iris-virginica
4  6.5  3.0  5.8  2.2  Iris-virginica
5  7.6  3.0  6.6  2.1  Iris-virginica
6  4.9  2.5  4.5  1.7  Iris-virginica
7  7.3  2.9  6.3  1.8  Iris-virginica
8  6.7  2.5  5.8  1.8  Iris-virginica
9  7.2  3.6  6.1  2.5  Iris-virginica
(10, 5)
    c1   c2   c3   c4              c5
0  6.5  3.2  5.1  2.0  Iris-virginica
1  6.4  2.7  5.3  1.9  Iris-virginica
2  6.8  3.0  5.5  2.1  Iris-virginica
3  5.7  2.5  5.0  2.0  Iris-virginica
4  5.8  2.8  5.1  2.4  Iris-virginica
5  6.4  3.2  5.3  2.3  Iris-virginica
6  6.5  3.0  5.5  1.8  Iris-virginica
7  7.7  3.8  6.7  2.2  Iris-virginica
8  7.7  2.6  6.9  2.3  Iris-virginica
9  6.0  2.2  5.0  1.5  Iris-virginica
(10, 5)
    c1   c2   c3   c4              c5
0  6.9  3.2  5.7  2.3  Iris-virginica
1  5.6  2.8  4.9  2.0  Iris-virginica
2  7.7  2.8  6.7  2.0  Iris-virginica
3  6.3  2.7  4.9  1.8  Iris-virginica
4  6.7  3.3  5.7  2.1  Iris-virginica
5  7.2  3.2  6.0  1.8  Iris-virginica
6  6.2  2.8  4.8  1.8  Iris-virginica
7  6.1  3.0  4.9  1.8  Iris-virginica
8  6.4  2.8  5.6  2.1  Iris-virginica
9  7.2  3.0  5.8  1.6  Iris-virginica
(10, 5)
    c1   c2   c3   c4              c5
0  7.4  2.8  6.1  1.9  Iris-virginica
1  7.9  3.8  6.4  2.0  Iris-virginica
2  6.4  2.8  5.6  2.2  Iris-virginica
3  6.3  2.8  5.1  1.5  Iris-virginica
4  6.1  2.6  5.6  1.4  Iris-virginica
5  7.7  3.0  6.1  2.3  Iris-virginica
6  6.3  3.4  5.6  2.4  Iris-virginica
7  6.4  3.1  5.5  1.8  Iris-virginica
8  6.0  3.0  4.8  1.8  Iris-virginica
9  6.9  3.1  5.4  2.1  Iris-virginica
(10, 5)
    c1   c2   c3   c4              c5
0  6.7  3.1  5.6  2.4  Iris-virginica
1  6.9  3.1  5.1  2.3  Iris-virginica
2  5.8  2.7  5.1  1.9  Iris-virginica
3  6.8  3.2  5.9  2.3  Iris-virginica
4  6.7  3.3  5.7  2.5  Iris-virginica
5  6.7  3.0  5.2  2.3  Iris-virginica
6  6.3  2.5  5.0  1.9  Iris-virginica
7  6.5  3.0  5.2  2.0  Iris-virginica
8  6.2  3.4  5.4  2.3  Iris-virginica
9  5.9  3.0  5.1  1.8  Iris-virginica

In [11]:
iris_iterator = pd.read_csv(iris_filename,header=None,names=['c1','c2','c3','c4','c5'],iterator=True)
print iris_iterator.get_chunk(10)
print iris_iterator.get_chunk(20)
piece = iris_iterator.get_chunk(2)
piece


    c1   c2   c3   c4           c5
0  5.1  3.5  1.4  0.2  Iris-setosa
1  4.9  3.0  1.4  0.2  Iris-setosa
2  4.7  3.2  1.3  0.2  Iris-setosa
3  4.6  3.1  1.5  0.2  Iris-setosa
4  5.0  3.6  1.4  0.2  Iris-setosa
5  5.4  3.9  1.7  0.4  Iris-setosa
6  4.6  3.4  1.4  0.3  Iris-setosa
7  5.0  3.4  1.5  0.2  Iris-setosa
8  4.4  2.9  1.4  0.2  Iris-setosa
9  4.9  3.1  1.5  0.1  Iris-setosa
     c1   c2   c3   c4           c5
0   5.4  3.7  1.5  0.2  Iris-setosa
1   4.8  3.4  1.6  0.2  Iris-setosa
2   4.8  3.0  1.4  0.1  Iris-setosa
3   4.3  3.0  1.1  0.1  Iris-setosa
4   5.8  4.0  1.2  0.2  Iris-setosa
5   5.7  4.4  1.5  0.4  Iris-setosa
6   5.4  3.9  1.3  0.4  Iris-setosa
7   5.1  3.5  1.4  0.3  Iris-setosa
8   5.7  3.8  1.7  0.3  Iris-setosa
9   5.1  3.8  1.5  0.3  Iris-setosa
10  5.4  3.4  1.7  0.2  Iris-setosa
11  5.1  3.7  1.5  0.4  Iris-setosa
12  4.6  3.6  1.0  0.2  Iris-setosa
13  5.1  3.3  1.7  0.5  Iris-setosa
14  4.8  3.4  1.9  0.2  Iris-setosa
15  5.0  3.0  1.6  0.2  Iris-setosa
16  5.0  3.4  1.6  0.4  Iris-setosa
17  5.2  3.5  1.5  0.2  Iris-setosa
18  5.2  3.4  1.4  0.2  Iris-setosa
19  4.7  3.2  1.6  0.2  Iris-setosa
Out[11]:
c1 c2 c3 c4 c5
0 4.8 3.1 1.6 0.2 Iris-setosa
1 5.4 3.4 1.5 0.4 Iris-setosa

In [12]:
import csv
with open(iris_filename,'rb') as data_stream:
    for n,row in enumerate(csv.DictReader(data_stream,
                                          fieldnames=['sepal_length','sepal_width','petal_length','petal_width','target'],
                                          dialect='excel')):
        if n==0:
            print n,row
        else:
            break


0 {'sepal_width': '3.5', 'petal_width': '0.2', 'target': 'Iris-setosa', 'sepal_length': '5.1', 'petal_length': '1.4'}

In [13]:
import csv
with open(iris_filename,'rb') as data_stream:
    for n,row in enumerate(csv.reader(data_stream,dialect='excel')):
        if n==0:
            print n,row
        else:
            break


0 ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']

In [14]:
import pandas as pd
my_own_datasest = pd.DataFrame({'col1':range(5),'col2':[1.0]*5,'col3':1.9,'col4':'Hello!'})
print my_own_datasest
print my_own_datasest.dtypes


   col1  col2  col3    col4
0     0     1   1.9  Hello!
1     1     1   1.9  Hello!
2     2     1   1.9  Hello!
3     3     1   1.9  Hello!
4     4     1   1.9  Hello!
col1      int64
col2    float64
col3    float64
col4     object
dtype: object

In [15]:
my_own_datasest['col1']=my_own_datasest['col1'].astype(float)
print my_own_datasest.dtypes


col1    float64
col2    float64
col3    float64
col4     object
dtype: object

In [16]:
mask_feature = iris['sepal_length'] > 6.0
print mask_feature.head(10)


0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: sepal_length, dtype: bool

In [17]:
mask_target=iris['target']=='Iris-virginica'
print mask_target.head(5)


0    False
1    False
2    False
3    False
4    False
Name: target, dtype: bool

In [18]:
print iris['target'].unique()


['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']

In [19]:
ground_targets_mean = iris.groupby(['target']).mean()
print ground_targets_mean


                 sepal_length  sepal_width  petal_length  petal_width
target                                                               
Iris-setosa             5.006        3.418         1.464        0.244
Iris-versicolor         5.936        2.770         4.260        1.326
Iris-virginica          6.588        2.974         5.552        2.026

In [20]:
iris.sort_index(by='sepal_length').head()


Out[20]:
sepal_length sepal_width petal_length petal_width target
13 4.3 3.0 1.1 0.1 Iris-setosa
42 4.4 3.2 1.3 0.2 Iris-setosa
38 4.4 3.0 1.3 0.2 Iris-setosa
8 4.4 2.9 1.4 0.2 Iris-setosa
41 4.5 2.3 1.3 0.3 Iris-setosa

In [21]:
dataset = pd.read_csv('data/loading_example_2.csv',index_col=0)
dataset


Out[21]:
val1 val2 val3
n
100 10 10 C
101 10 20 C
102 10 30 B
103 10 40 B
104 10 50 A

In [22]:
print dataset['val3'][104]
print dataset.loc[104,'val3']
print dataset.ix[104,2]
print dataset.iloc[4,2]
print dataset.ix[range(100,102),['val2','val3']]


A
A
A
A
     val2 val3
100    10    C
101    20    C

In [23]:
import pandas as pd
categorical_feature = pd.Series(['sunny','cloudy','snowy','rainy','foggy'])
mapping = pd.get_dummies(categorical_feature)
mapping


Out[23]:
cloudy foggy rainy snowy sunny
0 0 0 0 0 1
1 1 0 0 0 0
2 0 0 0 1 0
3 0 0 1 0 0
4 0 1 0 0 0

In [24]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ohe = OneHotEncoder()

levels = ['sunny','cloudy','snowy','rainy','foggy']
fit_levs = le.fit_transform(levels)
ohe.fit([[fit_levs[0]], [fit_levs[1]], [fit_levs[2]], [fit_levs[3]], [fit_levs[4]]])

print ohe.transform([le.transform(['sunny'])]).toarray()
print ohe.transform([le.transform(['cloudy'])]).toarray()


[[ 0.  0.  0.  0.  1.]]
[[ 1.  0.  0.  0.  0.]]

In [25]:
from sklearn.datasets import fetch_20newsgroups
categories = ['sci.med','sci.space']
twenty_sci_news = fetch_20newsgroups(categories = categories)
print twenty_sci_news.data[0]
print twenty_sci_news.filenames
print twenty_sci_news.target[0]
print twenty_sci_news.target_names[twenty_sci_news.target[0]]

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
word_count = count_vect.fit_transform(twenty_sci_news.data)
print word_count.shape
print word_count[0]

word_list = count_vect.get_feature_names()
for n in word_count[0].indices:
    print "word:", word_list[n], " apperas", word_count[0,n], " times"

from sklearn.feature_extraction.text import TfidfVectorizer
tf_vect = TfidfVectorizer(use_idf=False,norm='l1')
word_freq=tf_vect.fit_transform(twenty_sci_news.data)
word_list = tf_vect.get_feature_names()
for n in word_freq[0].indices:
    print "word:",word_list[n], " has frequency ", word_freq[0,n]


---------------------------------------------------------------------------
URLError                                  Traceback (most recent call last)
<ipython-input-25-0287e3069504> in <module>()
      1 from sklearn.datasets import fetch_20newsgroups
      2 categories = ['sci.med','sci.space']
----> 3 twenty_sci_news = fetch_20newsgroups(categories = categories)
      4 print twenty_sci_news.data[0]
      5 print twenty_sci_news.filenames

/home/moonbury/.local/lib/python2.7/site-packages/sklearn/datasets/twenty_newsgroups.pyc in fetch_20newsgroups(data_home, subset, categories, shuffle, random_state, remove, download_if_missing)
    221         if download_if_missing:
    222             cache = download_20newsgroups(target_dir=twenty_home,
--> 223                                           cache_path=cache_path)
    224         else:
    225             raise IOError('20Newsgroups dataset not found')

/home/moonbury/.local/lib/python2.7/site-packages/sklearn/datasets/twenty_newsgroups.pyc in download_20newsgroups(target_dir, cache_path)
     89 
     90     logger.warning("Downloading dataset from %s (14 MB)", URL)
---> 91     opener = urlopen(URL)
     92     with open(archive_path, 'wb') as f:
     93         f.write(opener.read())

/usr/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout)
    125     if _opener is None:
    126         _opener = build_opener()
--> 127     return _opener.open(url, data, timeout)
    128 
    129 def install_opener(opener):

/usr/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
    402             req = meth(req)
    403 
--> 404         response = self._open(req, data)
    405 
    406         # post-process response

/usr/lib/python2.7/urllib2.pyc in _open(self, req, data)
    420         protocol = req.get_type()
    421         result = self._call_chain(self.handle_open, protocol, protocol +
--> 422                                   '_open', req)
    423         if result:
    424             return result

/usr/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
    380             func = getattr(handler, meth_name)
    381 
--> 382             result = func(*args)
    383             if result is not None:
    384                 return result

/usr/lib/python2.7/urllib2.pyc in http_open(self, req)
   1212 
   1213     def http_open(self, req):
-> 1214         return self.do_open(httplib.HTTPConnection, req)
   1215 
   1216     http_request = AbstractHTTPHandler.do_request_

/usr/lib/python2.7/urllib2.pyc in do_open(self, http_class, req)
   1182         except socket.error, err: # XXX what error?
   1183             h.close()
-> 1184             raise URLError(err)
   1185         else:
   1186             try:

URLError: <urlopen error [Errno 111] Connection refused>

In [26]:
text1='we love data science'
text2='data science is hard'
documents = [text1,text2]
print documents

count_vect_1_grams = CountVectorizer(ngram_range=(1,1),stop_words=[],min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print 'word list = ', word_list
print "text1 is described with", [word_list[n] + "("+ str(word_count[0,n]) + ")" for n in word_count[0].indices]


['we love data science', 'data science is hard']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-26-10496f17b104> in <module>()
      4 print documents
      5 
----> 6 count_vect_1_grams = CountVectorizer(ngram_range=(1,1),stop_words=[],min_df=1)
      7 word_count = count_vect_1_grams.fit_transform(documents)
      8 word_list = count_vect_1_grams.get_feature_names()

NameError: name 'CountVectorizer' is not defined

In [27]:
count_vect_1_grams = CountVectorizer(ngram_range=(2,2),stop_words=[],min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()

print 'word list = ', word_list
print "text1 is described with", [word_list[n] + "("+ str(word_count[0,n]) + ")" for n in word_count[0].indices]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-27-4fe84e6681f9> in <module>()
----> 1 count_vect_1_grams = CountVectorizer(ngram_range=(2,2),stop_words=[],min_df=1)
      2 word_count = count_vect_1_grams.fit_transform(documents)
      3 word_list = count_vect_1_grams.get_feature_names()
      4 
      5 print 'word list = ', word_list

NameError: name 'CountVectorizer' is not defined

In [28]:
count_vect_1_grams = CountVectorizer(ngram_range=(1,2),stop_words=[],min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()

print 'word list = ', word_list
print "text1 is described with", [word_list[n] + "("+ str(word_count[0,n]) + ")" for n in word_count[0].indices]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-28-f5f761955708> in <module>()
----> 1 count_vect_1_grams = CountVectorizer(ngram_range=(1,2),stop_words=[],min_df=1)
      2 word_count = count_vect_1_grams.fit_transform(documents)
      3 word_list = count_vect_1_grams.get_feature_names()
      4 
      5 print 'word list = ', word_list

NameError: name 'CountVectorizer' is not defined

In [10]:



SVC with rbf kenerl
0.910400657603
0.000714888476631

In [ ]: