In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
In [2]:
# Two-column frame containing repeated (k1, k2) pairs, used by the
# duplicate-handling cells that follow.
data = DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data
Out[2]:
In [3]:
# Boolean flag per row: True when an identical row already appeared earlier.
data.duplicated(keep='first')
Out[3]:
In [4]:
# Keep only the first occurrence of each fully repeated row.
data.drop_duplicates(keep='first')
Out[4]:
In [5]:
# Add a running counter column. Its length follows the frame instead of a
# hard-coded 7, so the cell keeps working if the example rows change.
data['v1'] = np.arange(len(data))
data
Out[5]:
In [6]:
# De-duplicate on the k1 column alone; the first row of each k1 value survives.
data.drop_duplicates(subset=['k1'])
Out[6]:
In [9]:
# De-duplicate on the (k1, k2) pair, keeping the last occurrence of each pair.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')
Out[9]:
In [10]:
# Food names (with inconsistent capitalisation) and their weights in ounces.
data = DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data
Out[10]:
In [11]:
# Lookup table from a lower-case food name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])
In [13]:
# Normalise the food names to lower case first, then translate each one
# through the meat_to_animal lookup to fill the new 'animal' column.
lowercased = data['food'].map(str.lower)
data['animal'] = lowercased.map(meat_to_animal)
data
Out[13]:
In [14]:
def food_to_animal(food):
    """Return the animal for a food name, ignoring letter case."""
    return meat_to_animal[food.lower()]


# Same translation as the chained map above, done with a single function.
data['food'].map(food_to_animal)
Out[14]:
In [15]:
# Float series in which -999 and -1000 stand in for missing readings.
data = Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data
Out[15]:
In [16]:
# Swap the single sentinel -999 for NaN.
data.replace(to_replace=-999, value=np.nan)
Out[16]:
In [17]:
# Swap both sentinels for the same replacement, NaN.
data.replace(to_replace=[-999, -1000], value=np.nan)
Out[17]:
In [18]:
# Pairwise replacement: -999 becomes NaN and -1000 becomes 0.
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])
Out[18]:
In [19]:
# The same pairwise replacement, spelled as an old-value -> new-value mapping.
sentinel_map = {-999: np.nan, -1000: 0}
data.replace(sentinel_map)
Out[19]:
In [20]:
# 3x4 frame of the integers 0..11 with state names as row labels.
data = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
In [21]:
# Upper-cased copy of the row labels; the result is only displayed, not stored.
data.index.map(mapper=str.upper)
Out[21]:
In [22]:
# Show the frame again: the index.map call in the previous cell was not
# assigned back, so the row labels here are still the original ones.
data
Out[22]:
In [23]:
# Replace the row labels with their upper-case form, this time storing the result.
upper_labels = data.index.map(str.upper)
data.index = upper_labels
data
Out[23]:
In [25]:
# Title-case the row labels and upper-case the column labels on a renamed copy.
data.rename(index=str.title).rename(columns=str.upper)
Out[25]:
In [26]:
# Show the frame again: the rename in the previous cell was not assigned,
# so `data` still carries the labels it had before that call.
data
Out[26]:
In [27]:
# Rename one row label and one column label via old -> new mappings;
# labels not listed in a mapping are left as they are.
row_labels = {'OHIO': 'INDIANA'}
column_labels = {'three': 'peekaboo'}
data.rename(index=row_labels, columns=column_labels)
Out[27]:
In [28]:
# Make the OHIO -> INDIANA relabelling permanent by rebinding `data` to the
# renamed copy. This avoids inplace=True (which returns None) and the
# throwaway `_ = ...` assignment, which would shadow IPython's `_`
# last-output variable for the rest of the session.
data = data.rename(index={'OHIO': 'INDIANA'})
data
Out[28]:
In [30]:
# Ages to classify, and the bin edges delimiting the four age bands.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

# Assign each age to its band; with the default settings an age equal to a
# bin's upper edge (e.g. 25) falls into that bin.
cats = pd.cut(x=ages, bins=bins)
cats
Out[30]:
In [32]:
# Integer code of the bin each age was assigned to, in the order of `ages`.
cats.codes
Out[32]:
In [36]:
# How many ages landed in each bin.
cats.value_counts()
Out[36]:
In [37]:
# Same binning with right=False, so each interval is closed on its left edge
# and open on its right edge.
pd.cut(ages, bins=bins, right=False)
Out[37]:
In [38]:
# One readable label per bin, listed in the same order as the bin edges.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins=bins, labels=group_names)
Out[38]:
In [39]:
# Seed the generator so the uniform draws, and therefore the bins computed
# from them, come out the same on every Restart & Run All.
np.random.seed(12345)
data = np.random.rand(20)

# Passing an integer asks for that many equal-width bins over the observed
# range; precision=2 limits the digits used for the bin edges.
pd.cut(data, 4, precision=2)
Out[39]:
In [41]:
# Number of draws that fell into each of the four equal-width bins.
pd.cut(data, bins=4, precision=2).value_counts()
Out[41]:
In [42]:
# Seed the generator so the normal draws and the quartile edges derived from
# them are reproducible on every Restart & Run All.
np.random.seed(12345)
data = np.random.randn(1000)

# Cut at the sample quartiles, giving four buckets of equal size.
cats = pd.qcut(data, 4)
cats
Out[42]:
In [43]:
# Number of samples in each quantile bucket produced by qcut.
cats.value_counts()
Out[43]:
In [44]:
# Bucket by explicit quantile edges: bottom 10%, 10-50%, 50-90%, top 10%.
pd.qcut(data, q=[0, 0.1, 0.5, 0.9, 1.0])
Out[44]:
In [45]:
# Fixed seed, then a 1000x4 frame of standard-normal noise and its summary.
np.random.seed(12345)
noise = np.random.randn(1000, 4)
data = DataFrame(noise)
data.describe()
Out[45]:
In [46]:
# Take column 3 and list the entries whose magnitude exceeds 3.
col = data[3]
beyond_three = np.abs(col) > 3
col[beyond_three]
Out[46]:
In [48]:
# Rows holding at least one value whose magnitude exceeds 3. The axis is
# passed by keyword because pandas 2.0 no longer accepts it positionally
# in DataFrame.any.
data[(np.abs(data) > 3).any(axis=1)]
Out[48]:
In [52]:
# Cap the data at +/-3: any value outside that band is replaced by 3 carrying
# its own sign, everything else is kept. np.where returns a plain array, so
# the result is wrapped back into a DataFrame before summarising.
is_outlier = np.abs(data) > 3
data = DataFrame(np.where(is_outlier, np.sign(data) * 3, data))
data.describe()
Out[52]:
In [53]:
# 5x4 frame of the integers 0..19, plus a random ordering of its 5 row positions.
df = DataFrame(np.arange(5 * 4).reshape(5, 4))
sampler = np.random.permutation(5)
sampler
Out[53]:
In [54]:
# Display the 5x4 frame in its original row order, for comparison with the
# reordered versions that follow.
df
Out[54]:
In [55]:
# Reorder the rows by position according to the permutation in `sampler`.
df.take(indices=sampler, axis=0)
Out[55]:
In [56]:
# Sample 3 rows without replacement: shuffle every row position, keep the first 3.
shuffled = np.random.permutation(len(df))
df.take(shuffled[:3])
Out[56]:
In [57]:
# Values to draw from, and 10 random positions into them. Positions may
# repeat, which is what makes this sampling with replacement.
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, bag.size, size=10)
sampler
Out[57]:
In [59]:
# Look up the bag value at each sampled position.
draws = bag[sampler]
draws
Out[59]:
In [60]:
# Small frame with a categorical 'key' column and a numeric 'data1' column.
df = DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})
df
Out[60]:
In [64]:
# One indicator column per distinct value of 'key'.
dummies = pd.get_dummies(data=df['key'])
dummies
Out[64]:
In [62]:
# get_dummies joins the prefix and each level with prefix_sep ('_' by
# default), so the prefix itself carries no trailing underscore. This yields
# key_a / key_b / key_c rather than key__a / key__b / key__c.
pd.get_dummies(df['key'], prefix='key')
Out[62]:
In [66]:
# Keep 'data1' as a one-column frame and attach the indicator columns by index.
data1_only = df[['data1']]
df_with_dummy = data1_only.join(dummies)
df_with_dummy
Out[66]:
In [68]:
# Load movies.dat: fields are separated by '::' and the file has no header
# row, so the column names are supplied here. The python engine is requested
# explicitly for the multi-character separator.
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(
    'movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)
movies[:10]
Out[68]:
In [73]:
# Each movie's 'genres' field is a '|'-separated list; turn every one into a set.
genre_iter = (set(x.split('|')) for x in movies.genres)

# Union all per-movie sets into the sorted list of distinct genre names.
# Starting from an empty set keeps this well-defined even when `movies` has
# no rows (set.union called with no arguments raises TypeError).
genres = sorted(set().union(*genre_iter))
genres
Out[73]:
In [76]:
# All-zero int32 indicator matrix: one row per movie, one column per genre.
zero_block = np.zeros((len(movies), len(genres)), dtype=np.int32)
dummies = DataFrame(zero_block, columns=genres)
dummies
Out[76]:
In [78]:
# Flag each movie's genres in the indicator matrix. `dummies` was built with
# the default integer index, so the counter from enumerate() is also the row
# label that .loc expects. (.ix was removed in pandas 1.0.)
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1
dummies
Out[78]:
In [79]:
# Attach the genre indicators to the movie table, prefixing each indicator
# column with 'Genre_' to set them apart from the original columns.
movies_windic = movies.join(dummies.add_prefix('Genre_'))

# Show the first row; .iloc selects it by position (.ix was removed in pandas 1.0).
movies_windic.iloc[0]
Out[79]:
In [80]:
# Ten uniform draws from [0, 1). The generator was seeded in an earlier cell,
# so these are reproducible when the notebook is run top to bottom.
values = np.random.rand(10)
values
Out[80]:
In [81]:
# Bucket the draws into five equal-width intervals between 0 and 1, then
# expand the bucket membership into one indicator column per interval.
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
binned = pd.cut(values, bins)
pd.get_dummies(binned)
Out[81]:
In [ ]: