In [1]:

    
import pandas as pd
import numpy as np
from pandas import DataFrame, Series

Data transformation

Removing duplicates



In [2]:

    
# Small frame with repeated (k1, k2) pairs, used by the duplicate-handling cells.
data = DataFrame(dict(
    k1=['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    k2=[1, 1, 2, 3, 3, 4, 4],
))
data



In [3]:

    
# Boolean mask: True for each row that repeats an earlier row (first occurrence is kept as False).
data.duplicated(keep='first')









    Out[3]:





0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool



In [4]:

    
# New frame without the repeated rows; the first occurrence of each row is retained.
data.drop_duplicates(keep='first')



In [5]:

    
# Add a column holding 0..6 so the following cells can show which rows survive
# when duplicates are dropped on a subset of the columns.
data['v1'] = np.arange(7)
data



In [6]:

    
# Judge duplicates on 'k1' alone; other columns are ignored for the comparison.
data.drop_duplicates(subset=['k1'])



In [9]:

    
# Judge duplicates on the (k1, k2) pair and keep the last occurrence instead of the first.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

Transforming data using a function or mapping



In [10]:

    
# Food names (note the inconsistent capitalisation) with a weight in ounces for each entry.
data = DataFrame(dict(
    food=[
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
))
data









    Out[10]:






  
    
      
      food
      ounces
    
  
  
    
      0
      bacon
      4.0
    
    
      1
      pulled pork
      3.0
    
    
      2
      bacon
      12.0
    
    
      3
      Pastrami
      6.0
    
    
      4
      corned beef
      7.5
    
    
      5
      Bacon
      8.0
    
    
      6
      pastrami
      3.0
    
    
      7
      honey ham
      5.0
    
    
      8
      nova lox
      6.0



In [11]:

    
# Lookup table from a lower-case food name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])



In [13]:

    
# Normalise the food names to lower case, then translate each one through the
# meat_to_animal lookup table and store the result as a new column.
data['animal'] = data['food'].str.lower().map(meat_to_animal)
data









    Out[13]:






  
    
      
      food
      ounces
      animal
    
  
  
    
      0
      bacon
      4.0
      pig
    
    
      1
      pulled pork
      3.0
      pig
    
    
      2
      bacon
      12.0
      pig
    
    
      3
      Pastrami
      6.0
      cow
    
    
      4
      corned beef
      7.5
      cow
    
    
      5
      Bacon
      8.0
      pig
    
    
      6
      pastrami
      3.0
      cow
    
    
      7
      honey ham
      5.0
      pig
    
    
      8
      nova lox
      6.0
      salmon



In [14]:

    
# Same translation in one pass: lower-case each name and index the lookup dict directly.
data['food'].map(lambda food_name: meat_to_animal[food_name.lower()])









    Out[14]:





0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Replacing values



In [15]:

    
# Float series containing the placeholder values -999 and -1000 that the next cells replace.
data = Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data









    Out[15]:





0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64



In [16]:

    
# Replace one placeholder value with NaN; returns a new Series.
data.replace(to_replace=-999, value=np.nan)









    Out[16]:





0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64



In [17]:

    
# Replace several placeholder values with the same substitute (NaN).
data.replace(to_replace=[-999, -1000], value=np.nan)









    Out[17]:





0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64



In [18]:

    
# Replace each placeholder with its own substitute, paired by position: -999 -> NaN, -1000 -> 0.
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])









    Out[18]:





0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64



In [19]:

    
# Same per-value replacement expressed as an {old: new} mapping.
data.replace(to_replace={-999: np.nan, -1000: 0})









    Out[19]:





0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Renaming axis indexes



In [20]:

    
# 3x4 frame of the integers 0..11 with state names as row labels.
data = DataFrame(
    np.arange(12).reshape(3, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'New York'],
)



In [21]:

    
# Upper-case every row label; the result is returned and `data` is not modified.
data.index.map(lambda label: label.upper())









    Out[21]:





array(['OHIO', 'COLORADO', 'NEW YORK'], dtype=object)



In [22]:

    
# Display the frame again: the preceding index.map call was not assigned back,
# so the row labels of `data` are unchanged.
data



In [23]:

    
# Overwrite the row labels with their upper-case form; this modifies `data` itself.
data.index = [label.upper() for label in data.index]
data



In [25]:

    
# Title-case the row labels and upper-case the column labels in a renamed copy.
data.rename(
    index=lambda label: label.title(),
    columns=lambda label: label.upper(),
)



In [26]:

    
# Display the frame again: the preceding rename call was not assigned back and
# did not use inplace=True, so `data` still has its previous labels.
data



In [27]:

    
# Rename only the listed labels: one row label and one column label, in a renamed copy.
data.rename(
    index=dict(OHIO='INDIANA'),
    columns=dict(three='peekaboo'),
)



In [28]:

    
# Rename the 'OHIO' row label to 'INDIANA' and rebind `data` to the renamed frame.
# The previous form used inplace=True and assigned the return value (always None
# for in-place operations) to `_`, which shadows IPython's "last output"
# variable for the rest of the session.
data = data.rename(index={'OHIO': 'INDIANA'})
data

Discretization and binning



In [30]:

    
# Ages to be grouped, and the bin edges defining the intervals
# (18, 25], (25, 35], (35, 60] and (60, 100].
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]
bins = [18, 25, 35, 60, 100]
# Assign every age to its interval; the result is a Categorical.
cats = pd.cut(x=ages, bins=bins)
cats









    Out[30]:





[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]



In [32]:

    
# Integer code of the bin each age fell into (0 is the first bin).
cats.codes









    Out[32]:





array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)



In [36]:

    
# Number of ages that landed in each bin.
cats.value_counts(dropna=True)









    Out[36]:





(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64



In [37]:

    
# Same binning, but with intervals closed on the left and open on the right.
pd.cut(x=ages, bins=bins, right=False)









    Out[37]:





[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, object): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]



In [38]:

    
# Human-readable names for the four age bins, in bin order.
group_names = [
    'Youth',
    'YoungAdult',
    'MiddleAged',
    'Senior',
]
# Label each age with its bin name instead of the interval notation.
pd.cut(x=ages, bins=bins, labels=group_names)









    Out[38]:





[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]



In [39]:

    
# 20 uniform random draws from [0, 1).
data = np.random.rand(20)
# An integer `bins` asks for that many equal-width bins spanning the data range;
# bin edges are displayed with 2 digits of precision.
pd.cut(x=data, bins=4, precision=2)









    Out[39]:





[(0.064, 0.29], (0.74, 0.97], (0.29, 0.52], (0.29, 0.52], (0.74, 0.97], ..., (0.52, 0.74], (0.74, 0.97], (0.74, 0.97], (0.064, 0.29], (0.29, 0.52]]
Length: 20
Categories (4, object): [(0.064, 0.29] < (0.29, 0.52] < (0.52, 0.74] < (0.74, 0.97]]



In [41]:

    
# Count how many draws fall in each of the four equal-width bins.
pd.cut(x=data, bins=4, precision=2).value_counts()









    Out[41]:





(0.064, 0.29]    6
(0.29, 0.52]     5
(0.52, 0.74]     2
(0.74, 0.97]     7
dtype: int64



In [42]:

    
# 1000 standard-normal draws.
data = np.random.randn(1000)
# Bin by sample quantiles (here quartiles) rather than by equal-width intervals.
cats = pd.qcut(x=data, q=4)
cats









    Out[42]:





[[-3.568, -0.667], [-3.568, -0.667], (0.687, 2.922], (-0.667, -0.0231], (-0.667, -0.0231], ..., (-0.0231, 0.687], [-3.568, -0.667], (-0.667, -0.0231], (-0.667, -0.0231], [-3.568, -0.667]]
Length: 1000
Categories (4, object): [[-3.568, -0.667] < (-0.667, -0.0231] < (-0.0231, 0.687] < (0.687, 2.922]]



In [43]:

    
# Number of draws in each quantile bin.
cats.value_counts(dropna=True)









    Out[43]:





[-3.568, -0.667]     250
(-0.667, -0.0231]    250
(-0.0231, 0.687]     250
(0.687, 2.922]       250
dtype: int64



In [44]:

    
# Custom quantile edges: bottom 10%, 10-50%, 50-90% and top 10%.
pd.qcut(x=data, q=[0, 0.1, 0.5, 0.9, 1.0])









    Out[44]:





[(-1.277, -0.0231], [-3.568, -1.277], (1.303, 2.922], (-1.277, -0.0231], (-1.277, -0.0231], ..., (-0.0231, 1.303], (-1.277, -0.0231], (-1.277, -0.0231], (-1.277, -0.0231], [-3.568, -1.277]]
Length: 1000
Categories (4, object): [[-3.568, -1.277] < (-1.277, -0.0231] < (-0.0231, 1.303] < (1.303, 2.922]]

Detecting and filtering outliers



In [45]:

    
# Fix the global NumPy seed so the random frame below is reproducible.
np.random.seed(12345)
# 1000 rows x 4 columns of standard-normal draws.
data = DataFrame(np.random.standard_normal((1000, 4)))
data.describe()









    Out[45]:






  
    
      
      0
      1
      2
      3
    
  
  
    
      count
      1000.000000
      1000.000000
      1000.000000
      1000.000000
    
    
      mean
      -0.067684
      0.067924
      0.025598
      -0.002298
    
    
      std
      0.998035
      0.992106
      1.006835
      0.996794
    
    
      min
      -3.428254
      -3.548824
      -3.184377
      -3.745356
    
    
      25%
      -0.774890
      -0.591841
      -0.641675
      -0.644144
    
    
      50%
      -0.116401
      0.101143
      0.002073
      -0.013611
    
    
      75%
      0.616366
      0.780282
      0.680391
      0.654328
    
    
      max
      3.366626
      2.653656
      3.260383
      3.927528



In [46]:

    
# Take column 3 and show only the entries whose magnitude exceeds 3.
col = data[3]
col[col.abs() > 3]









    Out[46]:





97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64



In [48]:

    
# Select every row that has at least one value whose magnitude exceeds 3.
# `axis` is passed by keyword: DataFrame.any takes keyword-only arguments in
# pandas 2.x, so the positional form `.any(1)` raises a TypeError there.
data[(np.abs(data) > 3).any(axis=1)]



In [52]:

    
# Cap values at magnitude 3: wherever |value| > 3, substitute 3 with the value's
# sign; every other entry is left as is. np.where returns a plain array, so the
# result is wrapped in a fresh DataFrame.
data = DataFrame(np.where(data.abs() > 3, np.sign(data) * 3, data))
data.describe()









    Out[52]:






  
    
      
      0
      1
      2
      3
    
  
  
    
      count
      1000.000000
      1000.000000
      1000.000000
      1000.000000
    
    
      mean
      -0.067623
      0.068473
      0.025153
      -0.002081
    
    
      std
      0.995485
      0.990253
      1.003977
      0.989736
    
    
      min
      -3.000000
      -3.000000
      -3.000000
      -3.000000
    
    
      25%
      -0.774890
      -0.591841
      -0.641675
      -0.644144
    
    
      50%
      -0.116401
      0.101143
      0.002073
      -0.013611
    
    
      75%
      0.616366
      0.780282
      0.680391
      0.654328
    
    
      max
      3.000000
      2.653656
      3.000000
      3.000000

Permutation and random sampling



In [53]:

    
# 5x4 frame of the integers 0..19.
df = DataFrame(np.arange(20).reshape(5, 4))
# Random ordering of the row positions 0..4.
sampler = np.random.permutation(5)
sampler









    Out[53]:





array([1, 0, 2, 3, 4])



In [54]:

    
# Display the frame in its original row order, for comparison with the permuted version.
df



In [55]:

    
# Reorder the rows by position according to the permutation in `sampler`.
df.take(sampler, axis=0)



In [56]:

    
# Random subset without replacement: the first 3 positions of a fresh permutation.
df.take(np.random.permutation(len(df))[:3], axis=0)



In [57]:

    
# Values to draw from.
bag = np.array([5, 7, -1, 6, 4])
# 10 random positions into `bag`; positions may repeat (sampling with replacement).
sampler = np.random.randint(low=0, high=len(bag), size=10)
sampler









    Out[57]:





array([4, 4, 2, 2, 2, 0, 3, 0, 4, 1])



In [59]:

    
# Look up the value in `bag` at each sampled position.
draws = bag[sampler]
draws









    Out[59]:





array([ 4,  4, -1, -1, -1,  5,  6,  5,  4,  7])

Computing indicator / dummy variables



In [60]:

    
# Categorical 'key' column plus a numeric column, used to demonstrate dummy variables.
df = DataFrame(dict(
    key=list('bbacab'),
    data1=range(6),
))
df



In [64]:

    
# One indicator column per distinct value of 'key'.
dummies = pd.get_dummies(data=df['key'])
dummies



In [62]:

    
# Indicator columns with a 'key' prefix. get_dummies already inserts its default
# separator '_' between the prefix and the value, so the prefix itself carries no
# trailing underscore; 'key_' produced double-underscore names such as 'key__a'.
pd.get_dummies(df['key'], prefix='key')



In [66]:

    
# Attach the indicator columns to the numeric column, aligning on the row index.
df_with_dummy = df[['data1']].join(dummies, how='left')
df_with_dummy



In [68]:

    
# Column names for the header-less, '::'-delimited movies file.
mnames = ['movie_id', 'title', 'genres']
# A multi-character separator requires the python parsing engine.
movies = pd.read_csv(
    'movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)
movies.head(10)









    Out[68]:






  
    
      
      movie_id
      title
      genres
    
  
  
    
      0
      1
      Toy Story (1995)
      Animation|Children's|Comedy
    
    
      1
      2
      Jumanji (1995)
      Adventure|Children's|Fantasy
    
    
      2
      3
      Grumpier Old Men (1995)
      Comedy|Romance
    
    
      3
      4
      Waiting to Exhale (1995)
      Comedy|Drama
    
    
      4
      5
      Father of the Bride Part II (1995)
      Comedy
    
    
      5
      6
      Heat (1995)
      Action|Crime|Thriller
    
    
      6
      7
      Sabrina (1995)
      Comedy|Romance
    
    
      7
      8
      Tom and Huck (1995)
      Adventure|Children's
    
    
      8
      9
      Sudden Death (1995)
      Action
    
    
      9
      10
      GoldenEye (1995)
      Action|Adventure|Thriller



In [73]:

    
# Lazily split each '|'-separated genres string into a set of genre names.
genre_iter = (set(genre_field.split('|')) for genre_field in movies.genres)
print(type(genre_iter))
# Union all per-movie sets into the distinct genres, then order them alphabetically.
genres = list(set.union(*genre_iter))
genres.sort()
genres









    



<class 'generator'>






    Out[73]:





['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']



In [76]:

    
# All-zero int32 indicator matrix: one row per movie, one column per genre.
dummies = DataFrame(
    np.zeros((len(movies), len(genres)), dtype=np.int32),
    columns=genres,
)
dummies









    Out[76]:






  
    
      
      Action
      Adventure
      Animation
      Children's
      Comedy
      Crime
      Documentary
      Drama
      Fantasy
      Film-Noir
      Horror
      Musical
      Mystery
      Romance
      Sci-Fi
      Thriller
      War
      Western
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      5
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      6
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      7
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      8
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      9
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      10
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      11
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      12
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      13
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      14
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      15
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      16
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      17
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      18
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      19
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      20
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      21
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      22
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      23
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      24
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      25
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      26
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      27
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      28
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      29
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      3853
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3854
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3855
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3856
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3857
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3858
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3859
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3860
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3861
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3862
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3863
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3864
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3865
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3866
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3867
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3868
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3869
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3870
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3871
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3872
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3873
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3874
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3875
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3876
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3877
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3878
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3879
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3880
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3881
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3882
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

3883 rows × 18 columns



In [78]:

    
# Set the indicator to 1 for every genre listed on each movie.
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0. The row is
# addressed by label `i`; `dummies` was built without an explicit index, so its
# labels 0..n-1 coincide with the enumerate() position.
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1
dummies









    Out[78]:






  
    
      
      Action
      Adventure
      Animation
      Children's
      Comedy
      Crime
      Documentary
      Drama
      Fantasy
      Film-Noir
      Horror
      Musical
      Mystery
      Romance
      Sci-Fi
      Thriller
      War
      Western
    
  
  
    
      0
      0
      0
      1
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      0
      1
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      3
      0
      0
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      5
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      6
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      7
      0
      1
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      8
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      9
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      10
      0
      0
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      11
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      12
      0
      0
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      13
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      14
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      15
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      16
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      17
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      18
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      19
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      20
      1
      0
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      21
      0
      0
      0
      0
      0
      1
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      22
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      23
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
    
    
      24
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      25
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      26
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      27
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      28
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
    
    
      29
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      3853
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
    
    
      3854
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3855
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3856
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
    
    
      3857
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
    
    
      3858
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      3859
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3860
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      3861
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
    
    
      3862
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
    
    
      3863
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
    
    
      3864
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
    
    
      3865
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      3866
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      3867
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      1
      0
      0
    
    
      3868
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      3869
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      3870
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      3871
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      3872
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      3873
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3874
      0
      0
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3875
      0
      1
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3876
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      3877
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      3878
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3879
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3880
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3881
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3882
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
  

3883 rows × 18 columns



In [79]:

    
# Attach the genre indicator columns to the movie metadata, prefixing each
# indicator column with 'Genre_' so it cannot clash with existing column names.
movies_windic = movies.join(dummies.add_prefix('Genre_'))
# Show the first row. `.ix` was removed from pandas (deprecated in 0.20,
# removed in 1.0), so use explicit positional indexing instead.
movies_windic.iloc[0]









    Out[79]:





movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object



In [80]:

    
# Draw 10 uniform samples from the half-open interval [0, 1).
values = np.random.random_sample(size=10)
values









    Out[80]:





array([ 0.75603383,  0.90830844,  0.96588737,  0.17373658,  0.87592824,
        0.75415641,  0.163486  ,  0.23784062,  0.85564381,  0.58743194])



In [81]:

    
# Bin edges for five equal-width intervals; pd.cut closes them on the right
# by default, giving (0, 0.2], (0.2, 0.4], ..., (0.8, 1].
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
# Assign each value to its interval, then one-hot encode the interval labels.
binned = pd.cut(values, bins)
pd.get_dummies(binned)



In [ ]:

	0	1	2	3
5	-0.539741	0.476985	3.248944	-1.021228
97	-0.774363	0.552936	0.106061	3.927528
102	-0.655054	-0.565230	3.176873	0.959533
305	-2.315555	0.457246	-0.025907	-3.399312
324	0.050188	1.951312	3.260383	0.963301
400	0.146326	0.508391	-0.196713	-3.745356
499	-0.293333	-0.242459	-3.056990	1.918403
523	-3.428254	-0.296336	-0.439938	-0.867165
586	0.275144	1.179227	-3.184377	1.369891
808	-0.362528	-3.548824	1.553205	-2.186301
900	3.366626	-2.372214	0.851010	1.332846

	(0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1]
0	0	0	0	1	0
1	0	0	0	0	1
2	0	0	0	0	1
3	1	0	0	0	0
4	0	0	0	0	1
5	0	0	0	1	0
6	1	0	0	0	0
7	0	1	0	0	0
8	0	0	0	0	1
9	0	0	1	0	0

	food	ounces
0	bacon	4.0
1	pulled pork	3.0
2	bacon	12.0
3	Pastrami	6.0
4	corned beef	7.5
5	Bacon	8.0
6	pastrami	3.0
7	honey ham	5.0
8	nova lox	6.0

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.067684	0.067924	0.025598	-0.002298
std	0.998035	0.992106	1.006835	0.996794
min	-3.428254	-3.548824	-3.184377	-3.745356
25%	-0.774890	-0.591841	-0.641675	-0.644144
50%	-0.116401	0.101143	0.002073	-0.013611
75%	0.616366	0.780282	0.680391	0.654328
max	3.366626	2.653656	3.260383	3.927528

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy
5	6	Heat (1995)	Action\|Crime\|Thriller
6	7	Sabrina (1995)	Comedy\|Romance
7	8	Tom and Huck (1995)	Adventure\|Children's
8	9	Sudden Death (1995)	Action
9	10	GoldenEye (1995)	Action\|Adventure\|Thriller

	Action	Adventure	Animation	Children's	Comedy	Crime	Documentary	Drama	Fantasy	Film-Noir	Horror	Musical	Mystery	Romance	Sci-Fi	Thriller	War	Western
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
5	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
6	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
7	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
8	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
9	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
10	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
11	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
12	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
13	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
14	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
15	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
16	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
17	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
18	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
19	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
20	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
21	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
22	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
23	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
24	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
25	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
26	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
27	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
28	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
29	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3853	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3854	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3855	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3856	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3857	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3858	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3859	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3860	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3861	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3862	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3863	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3864	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3865	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3866	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3867	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3868	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3869	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3870	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3871	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3872	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3873	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3874	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3875	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3876	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3877	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3878	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3879	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3880	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3881	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3882	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0

	(0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1]
0	0	0	0	1	0
1	0	0	0	0	1
2	0	0	0	0	1
3	1	0	0	0	0
4	0	0	0	0	1
5	0	0	0	1	0
6	1	0	0	0	0
7	0	1	0	0	0
8	0	0	0	0	1
9	0	0	1	0	0

	(0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1]
0	0	0	0	1	0
1	0	0	0	0	1
2	0	0	0	0	1
3	1	0	0	0	0
4	0	0	0	0	1
5	0	0	0	1	0
6	1	0	0	0	0
7	0	1	0	0	0
8	0	0	0	0	1
9	0	0	1	0	0

	(0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1]
0	0	0	0	1	0
1	0	0	0	0	1
2	0	0	0	0	1
3	1	0	0	0	0
4	0	0	0	0	1
5	0	0	0	1	0
6	1	0	0	0	0
7	0	1	0	0	0
8	0	0	0	0	1
9	0	0	1	0	0