In [1]:
import pandas as pd
import numpy as np

In [18]:
# Explore the data: pandas Series basics, then a first look at the ad dataset

In [8]:
# Build a Series from a plain list; with no index given, pandas labels the
# entries 0..n-1.
obj = pd.Series(data=[3, 5, -2, 1])
obj


Out[8]:
0    3
1    5
2   -2
3    1
dtype: int64

In [9]:
# Raw values of the Series as a NumPy array (the index is not included).
obj.values


Out[9]:
array([ 3,  5, -2,  1])

In [10]:
# Index labels of the Series; here the default integer labels 0..3.
obj.index


Out[10]:
Int64Index([0, 1, 2, 3], dtype='int64')

In [11]:
# Arithmetic is element-wise: every value is doubled and the index is kept.
obj *2


Out[11]:
0     6
1    10
2    -4
3     2
dtype: int64

In [12]:
# Boolean-mask selection: keep only the entries whose value is greater than 2.
obj[obj>2]


Out[12]:
0    3
1    5
dtype: int64

In [13]:


In [19]:
# Build a Series from a mapping: the keys become the index labels and the
# values become the data.
data = dict(a=30, b=70, c=160, d=5)
obj = pd.Series(data)
obj


Out[19]:
a     30
b     70
c    160
d      5
dtype: int64

In [20]:
# Build the Series again with an explicit label list. 'g' has no entry in
# `data`, so its value is NaN and the dtype is float64 (see the output).
index = ['a','b','c','d','g']
obj = pd.Series(data, index=index)
obj


Out[20]:
a     30
b     70
c    160
d      5
g    NaN
dtype: float64

In [16]:
# Element-wise missing-value test: True where the entry is NaN.
# NOTE(review): the output recorded below has no 'g' row, so it appears to
# predate the cell that re-indexes obj with 'g' (execution counts are out of
# order) - re-run to refresh it.
pd.isnull(obj)


Out[16]:
a    False
b    False
c    False
d    False
dtype: bool

In [17]:
# Inverse of pd.isnull: True where the entry holds a value.
# NOTE(review): the output recorded below has no 'g' row, so it appears to
# predate the cell that re-indexes obj with 'g' (execution counts are out of
# order) - re-run to refresh it.
pd.notnull(obj)


Out[17]:
a    True
b    True
c    True
d    True
dtype: bool

In [23]:
# Load the ad dataset from a path relative to the notebook's working directory.
# header=None: the first line is treated as data, so columns are labelled by
# position (0, 1, 2, ...). Note that `data` previously held a dict and is rebound here.
data = pd.read_csv("data_example/ad-dataset/ad.data",header=None)

In [ ]:


In [24]:
# Summary statistics per column. Only the numeric columns are summarised: the
# output covers columns 4..1557, i.e. the object-typed columns 0-3 and 1558 are left out.
data.describe()


Out[24]:
4 5 6 7 8 9 10 11 12 13 ... 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557
count 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 ... 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000 3279.000000
mean 0.004270 0.011589 0.004575 0.003355 0.003965 0.011589 0.003355 0.004880 0.009149 0.004575 ... 0.006099 0.004575 0.003660 0.002440 0.003050 0.006404 0.012809 0.013419 0.009759 0.001525
std 0.065212 0.107042 0.067491 0.057831 0.062850 0.107042 0.057831 0.069694 0.095227 0.067491 ... 0.077872 0.067491 0.060393 0.049341 0.055148 0.079783 0.112466 0.115077 0.098320 0.039026
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 1554 columns


In [25]:
# Column labels; integers 0..1558 because the file was read with header=None.
data.columns


Out[25]:
Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1549, 1550, 1551, 1552, 1553, 1554, 1555, 1556, 1557, 1558],
           dtype='int64', length=1559)

In [26]:
# dtype of every column: columns 0-3 and 1558 are object, the rest int64.
data.dtypes


Out[26]:
0       object
1       object
2       object
3       object
4        int64
5        int64
6        int64
7        int64
8        int64
9        int64
10       int64
11       int64
12       int64
13       int64
14       int64
15       int64
16       int64
17       int64
18       int64
19       int64
20       int64
21       int64
22       int64
23       int64
24       int64
25       int64
26       int64
27       int64
28       int64
29       int64
         ...  
1529     int64
1530     int64
1531     int64
1532     int64
1533     int64
1534     int64
1535     int64
1536     int64
1537     int64
1538     int64
1539     int64
1540     int64
1541     int64
1542     int64
1543     int64
1544     int64
1545     int64
1546     int64
1547     int64
1548     int64
1549     int64
1550     int64
1551     int64
1552     int64
1553     int64
1554     int64
1555     int64
1556     int64
1557     int64
1558    object
dtype: object

In [6]:
# Select one column by its label (the integer 1); the result is a Series.
# Note the '?' placeholders among the values.
data[1]


Out[6]:
0        125
1        468
2        230
3        468
4        468
5        468
6        460
7        234
8        468
9        468
10         ?
11        52
12        60
13        60
14       230
15       468
16       468
17       125
18       468
19       585
20        60
21        60
22        60
23        60
24         ?
25        52
26        60
27       468
28       234
29       234
        ... 
3249       ?
3250       ?
3251      16
3252      75
3253       ?
3254     100
3255       ?
3256     175
3257       ?
3258       ?
3259     600
3260      64
3261       ?
3262     200
3263      16
3264     184
3265      26
3266     130
3267     192
3268     100
3269       ?
3270       ?
3271       ?
3272     110
3273      30
3274      94
3275     140
3276     120
3277       ?
3278      40
Name: 1, dtype: object

In [27]:
# Passing a list of labels selects several columns and returns a DataFrame.
data[[1,20]]


Out[27]:
1 20
0 125 0
1 468 0
2 230 0
3 468 0
4 468 0
5 468 0
6 460 0
7 234 0
8 468 0
9 468 0
10 ? 0
11 52 0
12 60 0
13 60 0
14 230 0
15 468 0
16 468 0
17 125 0
18 468 0
19 585 0
20 60 0
21 60 0
22 60 0
23 60 0
24 ? 0
25 52 0
26 60 0
27 468 0
28 234 0
29 234 0
... ... ...
3249 ? 0
3250 ? 0
3251 16 0
3252 75 0
3253 ? 0
3254 100 0
3255 ? 0
3256 175 0
3257 ? 0
3258 ? 0
3259 600 0
3260 64 0
3261 ? 0
3262 200 0
3263 16 0
3264 184 0
3265 26 0
3266 130 0
3267 192 0
3268 100 0
3269 ? 0
3270 ? 0
3271 ? 0
3272 110 0
3273 30 0
3274 94 0
3275 140 0
3276 120 0
3277 ? 0
3278 40 0

3279 rows × 2 columns


In [28]:
# First rows of column 1 (head() defaults to 5 rows).
data[1].head()


Out[28]:
0     125
1     468
2     230
3     468
4     468
Name: 1, dtype: object

In [29]:
# First 10 rows of column 1.
data[1].head(10)


Out[29]:
0     125
1     468
2     230
3     468
4     468
5     468
6     460
7     234
8     468
9     468
Name: 1, dtype: object

In [7]:
# A slice inside [] selects ROWS, not columns: rows 1 and 2 (end exclusive).
data[1:3]


Out[7]:
0 1 2 3 4 5 6 7 8 9 ... 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.

2 rows × 1559 columns


In [30]:
# Manipulate the data: filtering, indexing, editing rows and columns, cleaning

In [31]:
# Column 1 holds strings (dtype object, with '?' for missing values), so it
# cannot be compared to 0 directly: on Python 2 the str/int comparison is
# meaningless and on Python 3 it raises TypeError. Convert to numbers first;
# unparseable entries become NaN and are dropped by the filter.
data[pd.to_numeric(data[1], errors='coerce') > 0].head(4)


Out[31]:
0 1 2 3 4 5 6 7 8 9 ... 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
3 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.

4 rows × 1559 columns


In [32]:
# Combine two row filters with & (each condition needs its own parentheses).
# Column 1 holds strings (dtype object, with '?' for missing values), so it is
# converted to numbers before the comparison; unparseable entries become NaN
# and fail the `> 0` test.
data[(pd.to_numeric(data[1], errors='coerce') > 0) & (data[1558]=='ad.')].head(4)


Out[32]:
0 1 2 3 4 5 6 7 8 9 ... 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
3 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.

4 rows × 1559 columns


In [33]:
# `.ix` has been removed from pandas. On this integer-labelled index it behaved
# label-based (end label inclusive, hence the 4 rows in the output), which is
# exactly what `.loc` does.
data.loc[:3]


Out[33]:
0 1 2 3 4 5 6 7 8 9 ... 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
3 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.

4 rows × 1559 columns


In [34]:
# Position-based slicing: the end position is exclusive, so this returns 3 rows.
data.iloc[:3]


Out[34]:
0 1 2 3 4 5 6 7 8 9 ... 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.

3 rows × 1559 columns


In [35]:
# Label-based slicing: the end label is inclusive, so this returns 4 rows (labels 0..3).
data.loc[:3]


Out[35]:
0 1 2 3 4 5 6 7 8 9 ... 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
3 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.

4 rows × 1559 columns


In [36]:
# Assigning a scalar to a column sets every row of that column to the value.
data[1547] = 0

In [37]:
# Set a single cell (row label 3, column label 1) to 0.
# `.loc` replaces the removed `.ix`; both row and column are addressed by label here.
data.loc[3, 1] = 0

In [38]:
import random
# Overwrite the row labelled 0 with 1558 random 0/1 features followed by the
# 'ad.' label. `.loc` replaces the removed `.ix`, and `range` replaces the
# Python-2-only `xrange`.
data.loc[0] = [random.randint(0, 1) for _ in range(1558)] + ['ad.']

In [40]:
# Build one synthetic row (1558 random 0/1 features + the 'ad.' label) and add
# it at the end of the frame. DataFrame.append has been removed from pandas, so
# the row is wrapped in a one-row frame and concatenated; ignore_index=True
# renumbers the rows 0..n-1 as the original append call did.
row = [random.randint(0, 1) for _ in range(1558)] + ['ad.']
data = pd.concat([data, pd.DataFrame([row], columns=data.columns)], ignore_index=True)

In [70]:
# Append `row` (built in the previous cell) by assigning to a new row label
# equal to the current row count.
# NOTE(review): this only appends if the label len(data) is not already in the
# index (true for a 0..n-1 index); otherwise it would overwrite that row.
data.loc[len(data)] = row

In [41]:
# Add a new column by assignment; the scalar is broadcast to every row.
# The column index now mixes integer labels with a string label.
data['newcolumn'] = 'test value'
data.columns


Out[41]:
Index([           0,            1,            2,            3,            4,
                  5,            6,            7,            8,            9,
       ...
               1550,         1551,         1552,         1553,         1554,
               1555,         1556,         1557,         1558, u'newcolumn'],
      dtype='object', length=1560)

In [56]:
# Remove the demo column again. The axis is passed by keyword because newer
# pandas versions no longer accept it positionally; axis=1 means "columns".
data = data.drop('newcolumn', axis=1)
data.columns


Out[56]:
Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       1549, 1550, 1551, 1552, 1553, 1554, 1555, 1556, 1557, 1558],
      dtype='object', length=1559)

In [42]:
# Boolean Series, one entry per row: True where the whole row repeats an earlier row.
data.duplicated()


Out[42]:
0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14       True
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23       True
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
3250    False
3251     True
3252     True
3253     True
3254     True
3255     True
3256    False
3257    False
3258    False
3259     True
3260    False
3261     True
3262     True
3263     True
3264     True
3265    False
3266    False
3267    False
3268     True
3269     True
3270     True
3271     True
3272    False
3273    False
3274    False
3275    False
3276    False
3277     True
3278    False
3279    False
dtype: bool

In [43]:
# Distinct values of the label column (1558), each shown at the index of its
# first occurrence.
data[1558].drop_duplicates()


Out[43]:
0         ad.
459    nonad.
Name: 1558, dtype: object

In [44]:
# Same distinct label values, converted to a plain Python list.
data[1558].drop_duplicates().tolist()


Out[44]:
['ad.', 'nonad.']

In [76]:
# Encode the class label held in the last column: 'ad.' -> 1, 'nonad.' -> 0.
# Each mask is computed right before it is used, and .loc writes into the
# frame itself rather than into a temporary copy.
label_col = data.columns[-1]
adindices = data[label_col] == 'ad.'
data.loc[adindices, label_col] = 1
nonadindices = data[label_col] == 'nonad.'
data.loc[nonadindices, label_col] = 0

In [77]:
# dtype of the label column; the output dtype('O') shows it is still object
# after the 'ad.'/'nonad.' strings were replaced by 1/0.
data[1558].dtypes


Out[77]:
dtype('O')

In [78]:
# Cast the last (label) column from object to float.
label_col = data.columns[-1]
data[label_col] = data[label_col].astype(float)

In [79]:
# The file marks missing values with '?', padded with a varying number of
# spaces. One regex replacement matches a lone '?' with any surrounding
# whitespace, instead of one whole-frame pass per hard-coded padding width.
# Only string cells are matched; numeric cells are left untouched.
data = data.replace(r'^\s*\?\s*$', np.nan, regex=True)

In [80]:
# Drop every row that contains at least one NaN (dropna's default is how='any').
data=data.dropna()

In [81]:
# Alternative to dropping rows: replace any remaining NaN with the sentinel -1.
# NOTE(review): if the dropna() cell above has already run, no NaN is left and
# this is a no-op.
data=data.fillna(-1)

In [82]:
# Convert every column to a numeric dtype; apply() hands each column to
# pd.to_numeric in turn.
data = data.apply(pd.to_numeric)

In [83]:
# Build a small frame of synthetic positive examples with the same layout as
# `data`: N_FEATURES random 0/1 feature columns followed by the label 1.
# The rows are generated in one comprehension and the frame is built in a
# single call (instead of growing it row by row), and `range` replaces the
# Python-2-only `xrange`.
N_FEATURES = 1558
N_SYNTHETIC_ROWS = 2
data1 = pd.DataFrame(
    [
        [random.randint(0, 1) for _ in range(N_FEATURES)] + [1]
        for _ in range(N_SYNTHETIC_ROWS)
    ],
    columns=list(range(N_FEATURES + 1)),
)

In [85]:
# Row count before adding the synthetic rows. The call form of print runs on
# both Python 2 and Python 3 (the statement form is a syntax error on 3).
print(len(data))
# Stack the synthetic rows under the cleaned data. pd.concat already returns a
# new frame, so the inputs need no [:] slices. ignore_index=True renumbers
# the rows, because data1's labels 0 and 1 could otherwise duplicate labels
# already present in `data`.
datatot = pd.concat([data, data1], ignore_index=True)
len(datatot)


2362
Out[85]:
2364

In [ ]: