In [1]:
import pandas as pd
import numpy as np
In [18]:
#explore data
In [8]:
# Build a small Series from a plain list; pandas supplies the default 0..3 index.
obj = pd.Series(data=[3, 5, -2, 1])
obj
Out[8]:
In [9]:
# Underlying data of the Series as an array, without the index.
obj.values
Out[9]:
In [10]:
# Index (row labels) of the Series.
obj.index
Out[10]:
In [11]:
# Arithmetic is applied element-wise; the result keeps the same index.
obj *2
Out[11]:
In [12]:
# Boolean-mask selection: keep only the entries whose value is greater than 2.
obj[obj>2]
Out[12]:
In [13]:
In [19]:
# A dict converts to a Series whose index is the dict's keys.
data = dict(a=30, b=70, c=160, d=5)
obj = pd.Series(data)
obj
Out[19]:
In [20]:
# Align the dict-backed Series to an explicit label list.
# 'g' is not a key of `data`, so its entry comes out as NaN.
index = ['a', 'b', 'c', 'd', 'g']
obj = pd.Series(data).reindex(index)
obj
Out[20]:
In [16]:
# Element-wise missing-value mask: True where the entry is missing (NaN/None).
pd.isnull(obj)
Out[16]:
In [17]:
# Inverse of pd.isnull: True where the entry holds an actual value.
pd.notnull(obj)
Out[17]:
In [23]:
# Load the ad dataset from a path relative to the current working directory.
# header=None: the first line is read as data and columns get integer labels 0..N-1.
# NOTE(review): this rebinds `data`, which an earlier cell used for a plain dict.
data = pd.read_csv("data_example/ad-dataset/ad.data",header=None)
In [ ]:
In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()
Out[24]:
In [25]:
# Column labels; integers here because the file was read with header=None.
data.columns
Out[25]:
In [26]:
# dtype that pandas inferred for each column.
data.dtypes
Out[26]:
In [6]:
# A single label inside [] selects that column and returns a Series.
data[1]
Out[6]:
In [27]:
# A list of labels inside [] selects several columns and returns a DataFrame.
data[[1,20]]
Out[27]:
In [28]:
# First rows of column 1 (head() defaults to 5 rows).
data[1].head()
Out[28]:
In [29]:
# First 10 rows of column 1.
data[1].head(10)
Out[29]:
In [7]:
# A slice inside [] selects ROWS by position, not columns: rows 1 and 2 (end-exclusive).
data[1:3]
Out[7]:
In [30]:
#manipulate data
In [31]:
# Keep the rows where column 1 is positive and show the first 4 of them.
# NOTE(review): assumes column 1 is numeric at this point. Later cells replace '?'
# placeholders and call pd.to_numeric, so it may still hold strings here -- check data.dtypes.
data[data[1]> 0].head(4)
Out[31]:
In [32]:
# Combine two row conditions with & (each side parenthesised because & binds tighter
# than the comparisons): column 1 positive AND column 1558 equal to 'ad.'.
# NOTE(review): the `> 0` test assumes column 1 is numeric here -- confirm via data.dtypes.
data[(data[1]> 0) & (data[1558]=='ad.')].head(4)
Out[32]:
In [33]:
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. On an integer index it
# resolved labels, so `.loc` is the equivalent: rows labelled 0 through 3, end-inclusive.
data.loc[:3]
Out[33]:
In [34]:
# Position-based slice: the first 3 rows (positions 0, 1, 2; end-exclusive).
data.iloc[:3]
Out[34]:
In [35]:
# Label-based slice: rows up to and INCLUDING label 3 (unlike iloc, the end is inclusive).
data.loc[:3]
Out[35]:
In [36]:
# Assigning a scalar to a column broadcasts it to every row of column 1547
# (the column would be created if that label did not exist yet).
data[1547] = 0
In [37]:
# Set a single cell: row label 3, column label 1.
# `.ix` was removed in pandas 1.0; with integer row and column labels it was
# label-based, so `.loc` is the direct replacement.
data.loc[3, 1] = 0
In [38]:
import random

# Overwrite the row labelled 0 with 1558 random 0/1 feature values plus the 'ad.' label.
# `.ix` was removed in pandas 1.0 (use `.loc`); `range` replaces the Python-2-only `xrange`.
data.loc[0] = [random.randint(0, 1) for r in range(1558)] + ['ad.']
In [40]:
# Build one synthetic record: 1558 random 0/1 features followed by the 'ad.' label.
row = [random.randint(0, 1) for r in range(1558)] + ['ad.']
# Append it as a new last row. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; pd.concat with a one-row frame is the supported replacement.
# ignore_index=True renumbers the result 0..n-1.
data = pd.concat([data, pd.DataFrame([row], columns=data.columns)], ignore_index=True)
In [70]:
# In-place alternative for adding a row: assigning to a label that does not exist yet
# enlarges the frame. Assumes the index is still the default 0..n-1, so that
# len(data) is the first unused label. `row` is the list built in an earlier cell.
data.loc[len(data)] = row
In [41]:
# Assigning to a label that is not yet a column adds it; the scalar string is
# broadcast to every row.
data['newcolumn'] = 'test value'
# Show the column labels, which now include 'newcolumn'.
data.columns
Out[41]:
In [56]:
# Remove the demo column again. `axis` must be passed by keyword: the positional
# form `drop(label, 1)` was deprecated in pandas 1.x and removed in 2.0.
data = data.drop('newcolumn', axis=1)
data.columns
Out[56]:
In [42]:
# Boolean Series: True for every row that exactly repeats an earlier row.
data.duplicated()
Out[42]:
In [43]:
# Distinct values of column 1558, keeping the first occurrence of each.
data[1558].drop_duplicates()
Out[43]:
In [44]:
# Same distinct values of column 1558, returned as a plain Python list.
data[1558].drop_duplicates().tolist()
Out[44]:
In [76]:
# Recode the string labels in the last column as integers: 'ad.' -> 1, 'nonad.' -> 0.
label_col = data.columns[-1]

adindices = data[label_col] == 'ad.'
data.loc[adindices, label_col] = 1

nonadindices = data[label_col] == 'nonad.'
data.loc[nonadindices, label_col] = 0
In [77]:
# dtype of column 1558 (for a Series, .dtypes is the same as .dtype).
data[1558].dtypes
Out[77]:
In [78]:
# Cast the last column to float; presumably so the recoded 0/1 labels are stored
# numerically rather than as generic objects.
data[data.columns[-1]]=data[data.columns[-1]].astype(float)
In [79]:
# Turn the '?' missing-value placeholder into NaN. The previous version issued one
# exact-match replace per padding variant (several of them identical); a single
# anchored regex covers '?' with any amount of surrounding whitespace. Regex
# replacement only touches string cells, so numeric columns are left unchanged.
data = data.replace(r'^\s*\?\s*$', np.nan, regex=True)
In [80]:
# Drop every row that contains at least one NaN (dropna defaults to how='any').
# This overwrites `data`; the dropped rows can only be recovered by reloading.
data=data.dropna()
In [81]:
# Replace any remaining NaN with the sentinel value -1.
# NOTE(review): if the dropna() cell has already been run there is no NaN left and
# this is a no-op; the two cells look like alternative strategies -- confirm intent.
data=data.fillna(-1)
In [82]:
# Convert every column to a numeric dtype, one column at a time.
# pd.to_numeric raises if a column holds a value it cannot parse.
data = data.apply(pd.to_numeric)
In [83]:
# Empty frame with integer column labels 0..1558.
# `range` replaces `xrange`, which does not exist on Python 3.
data1 = pd.DataFrame(columns=list(range(1559)))
# Add two synthetic rows (1558 random 0/1 features plus label 1). len(data1) is the
# next unused row label, so the rows get labels 0 and 1.
data1.loc[len(data1)] = [random.randint(0, 1) for r in range(1558)] + [1]
data1.loc[len(data1)] = [random.randint(0, 1) for r in range(1558)] + [1]
In [85]:
# Row count before the synthetic rows are added. The call form of print works on
# both Python 2 and 3; the statement form is a syntax error on Python 3.
print(len(data))
# Stack the two frames row-wise. concat builds a new frame, so the `[:]` slices are
# not needed. Original row labels are kept, so data1's labels 0 and 1 may duplicate
# labels already present in `data`.
datatot = pd.concat([data, data1])
len(datatot)
Out[85]:
In [ ]: