Chapter_07_Part_02

``````

In [1]:

import pandas as pd
import numpy as np
from pandas import DataFrame, Series

``````

Data transformation

Removing duplicates

``````

In [2]:

# Small frame with repeated (k1, k2) pairs, used to demonstrate duplicate handling.
data = DataFrame(dict(k1=['one'] * 3 + ['two'] * 4,
                      k2=[1, 1, 2, 3, 3, 4, 4]))
data

``````
``````

Out[2]:

k1
k2

0
one
1

1
one
1

2
one
2

3
two
3

4
two
3

5
two
4

6
two
4

``````
``````

In [3]:

# Flag each row that exactly repeats an earlier row (all columns compared).
data.duplicated()

``````
``````

Out[3]:

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

``````
``````

In [4]:

# Keep only the first occurrence of each distinct row; returns a new frame.
data.drop_duplicates()

``````
``````

Out[4]:

k1
k2

0
one
1

2
one
2

3
two
3

5
two
4

``````
``````

In [5]:

# Add a running counter column so that otherwise-identical rows can be told apart.
data['v1'] = np.arange(7)
data

``````
``````

Out[5]:

k1
k2
v1

0
one
1
0

1
one
1
1

2
one
2
2

3
two
3
3

4
two
3
4

5
two
4
5

6
two
4
6

``````
``````

In [6]:

# Deduplicate on the 'k1' column alone; the first row seen for each key is kept.
data.drop_duplicates(['k1'])

``````
``````

Out[6]:

k1
k2
v1

0
one
1
0

3
two
3
3

``````
``````

In [9]:

# Deduplicate on the (k1, k2) pair, keeping the last occurrence rather than the first.
data.drop_duplicates(['k1', 'k2'], keep = 'last')

``````
``````

Out[9]:

k1
k2
v1

1
one
1
1

2
one
2
2

4
two
3
4

6
two
4
6

``````

Transforming data using a function or mapping

``````

In [10]:

# Food names are deliberately mixed-case to motivate normalising before mapping.
data = DataFrame(dict(
    food=['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
          'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6]
))
data

``````
``````

Out[10]:

food
ounces

0
bacon
4.0

1
pulled pork
3.0

2
bacon
12.0

3
Pastrami
6.0

4
corned beef
7.5

5
Bacon
8.0

6
pastrami
3.0

7
honey ham
5.0

8
nova lox
6.0

``````
``````

In [11]:

# Lookup from lower-case meat name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

``````
``````

In [13]:

# Normalise case first so 'Pastrami' and 'Bacon' find their lower-case keys,
# then translate each meat to its animal through the lookup dict.
data['animal'] = data['food'].str.lower().map(meat_to_animal)
data

``````
``````

Out[13]:

food
ounces
animal

0
bacon
4.0
pig

1
pulled pork
3.0
pig

2
bacon
12.0
pig

3
Pastrami
6.0
cow

4
corned beef
7.5
cow

5
Bacon
8.0
pig

6
pastrami
3.0
cow

7
honey ham
5.0
pig

8
nova lox
6.0
salmon

``````
``````

In [14]:

# Lower-case each food and index the dict directly inside one function; a food
# that is absent from meat_to_animal raises KeyError here.
data['food'].map(lambda x: meat_to_animal[x.lower()])

``````
``````

Out[14]:

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

``````

Replacing values

``````

In [15]:

# -999 and -1000 are the placeholder values the following cells replace.
data = Series([1, -999, 2, -999, -1000, 3], dtype=float)
data

``````
``````

Out[15]:

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

``````
``````

In [16]:

# Swap the single value -999 for NaN; returns a new Series and leaves data as is.
data.replace(-999, np.nan)

``````
``````

Out[16]:

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

``````
``````

In [17]:

# A list of targets with one replacement: every listed value becomes NaN.
data.replace([-999, -1000], np.nan)

``````
``````

Out[17]:

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

``````
``````

In [18]:

# Two parallel lists: targets and replacements are paired up by position.
data.replace([-999, -1000], [np.nan, 0])

``````
``````

Out[18]:

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

``````
``````

In [19]:

# Dict form of the same pairing: each key is replaced by its value.
data.replace({-999: np.nan, -1000: 0})

``````
``````

Out[19]:

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

``````

Renaming axis indexes

``````

In [20]:

# String row labels are required here: the following cells call str.upper /
# str.title on the index, which fails on the default integer index.
# NOTE(review): the saved output shows 'Ohio' and 'New York' with the middle
# label blank; 'Colorado' is reconstructed -- confirm against the original.
data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

``````
``````

In [21]:

# Produces a new upper-cased Index; the frame's own index is not modified.
data.index.map(str.upper)

``````
``````

Out[21]:

``````
``````

In [22]:

# Display the frame to confirm its row labels are still in their original case.
data

``````
``````

Out[22]:

one
two
three
four

Ohio
0
1
2
3

4
5
6
7

New York
8
9
10
11

``````
``````

In [23]:

# Assigning to .index is what actually changes the frame's row labels.
data.index = data.index.str.upper()
data

``````
``````

Out[23]:

one
two
three
four

OHIO
0
1
2
3

4
5
6
7

NEW YORK
8
9
10
11

``````
``````

In [25]:

# rename returns a modified copy: title-case the row labels and upper-case the
# column names by applying a function to each label.
data.rename(index = str.title, columns = str.upper)

``````
``````

Out[25]:

ONE
TWO
THREE
FOUR

Ohio
0
1
2
3

4
5
6
7

New York
8
9
10
11

``````
``````

In [26]:

# Display the frame again: the rename call did not alter it.
data

``````
``````

Out[26]:

one
two
three
four

OHIO
0
1
2
3

4
5
6
7

NEW YORK
8
9
10
11

``````
``````

In [27]:

# Dict arguments rename only the listed labels and leave every other label alone.
data.rename(index={'OHIO': 'INDIANA'},
columns={'three': 'peekaboo'})

``````
``````

Out[27]:

one
two
peekaboo
four

INDIANA
0
1
2
3

4
5
6
7

NEW YORK
8
9
10
11

``````
``````

In [28]:

# Rebind the result instead of using inplace=True: rename(..., inplace=True)
# returns None, so assigning it to `_` only overwrote IPython's last-output
# variable. The displayed frame is the same as before.
data = data.rename(index={'OHIO': 'INDIANA'})
data

``````
``````

Out[28]:

one
two
three
four

INDIANA
0
1
2
3

4
5
6
7

NEW YORK
8
9
10
11

``````

Discretization and binning

``````

In [30]:

# Bin edges; each interval is open on the left and closed on the right,
# i.e. (18, 25], (25, 35], (35, 60], (60, 100].
bins = [18, 25, 35, 60, 100]
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
cats = pd.cut(ages, bins=bins)
cats

``````
``````

Out[30]:

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

``````
``````

In [32]:

# Integer bin number assigned to each age (0 is the first bin).
cats.codes

``````
``````

Out[32]:

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

``````
``````

In [36]:

# How many ages fell into each bin.
cats.value_counts()

``````
``````

Out[36]:

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

``````
``````

In [37]:

# right=False flips which side is closed: [18, 25), [25, 35), ...
pd.cut(ages, bins, right = False)

``````
``````

Out[37]:

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, object): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

``````
``````

In [38]:

# One label per bin, in bin order; the labels replace the interval notation.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

``````
``````

Out[38]:

Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

``````
``````

In [39]:

# NOTE(review): no random seed is set before this cell, so the values (and the
# saved output) will differ on every run.
data = np.random.rand(20)
# An integer instead of explicit edges asks for that many equal-width bins
# spanning the data's min..max; precision controls the digits in the bin labels.
pd.cut(data, 4, precision = 2)

``````
``````

Out[39]:

[(0.064, 0.29], (0.74, 0.97], (0.29, 0.52], (0.29, 0.52], (0.74, 0.97], ..., (0.52, 0.74], (0.74, 0.97], (0.74, 0.97], (0.064, 0.29], (0.29, 0.52]]
Length: 20
Categories (4, object): [(0.064, 0.29] < (0.29, 0.52] < (0.52, 0.74] < (0.74, 0.97]]

``````
``````

In [41]:

# Equal-width bins generally hold unequal numbers of points.
pd.cut(data, 4, precision = 2).value_counts()

``````
``````

Out[41]:

(0.064, 0.29]    6
(0.29, 0.52]     5
(0.52, 0.74]     2
(0.74, 0.97]     7
dtype: int64

``````
``````

In [42]:

# qcut places the bin edges at sample quantiles (here quartiles) rather than
# at equally spaced values.
data = np.random.randn(1000)
cats = pd.qcut(data, 4)
cats

``````
``````

Out[42]:

[[-3.568, -0.667], [-3.568, -0.667], (0.687, 2.922], (-0.667, -0.0231], (-0.667, -0.0231], ..., (-0.0231, 0.687], [-3.568, -0.667], (-0.667, -0.0231], (-0.667, -0.0231], [-3.568, -0.667]]
Length: 1000
Categories (4, object): [[-3.568, -0.667] < (-0.667, -0.0231] < (-0.0231, 0.687] < (0.687, 2.922]]

``````
``````

In [43]:

# Quantile-based bins come out equally populated.
cats.value_counts()

``````
``````

Out[43]:

[-3.568, -0.667]     250
(-0.667, -0.0231]    250
(-0.0231, 0.687]     250
(0.687, 2.922]       250
dtype: int64

``````
``````

In [44]:

# Explicit quantile cut points between 0 and 1: bottom 10%, 10-50%, 50-90%, top 10%.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

``````
``````

Out[44]:

[(-1.277, -0.0231], [-3.568, -1.277], (1.303, 2.922], (-1.277, -0.0231], (-1.277, -0.0231], ..., (-0.0231, 1.303], (-1.277, -0.0231], (-1.277, -0.0231], (-1.277, -0.0231], [-3.568, -1.277]]
Length: 1000
Categories (4, object): [[-3.568, -1.277] < (-1.277, -0.0231] < (-0.0231, 1.303] < (1.303, 2.922]]

``````

Detecting and filtering outliers

``````

In [45]:

# Fix the seed so the random frame is the same on every run.
np.random.seed(12345)
# 1000 rows x 4 columns of standard-normal draws.
data = DataFrame(np.random.randn(1000, 4))
data.describe()

``````
``````

Out[45]:

0
1
2
3

count
1000.000000
1000.000000
1000.000000
1000.000000

mean
-0.067684
0.067924
0.025598
-0.002298

std
0.998035
0.992106
1.006835
0.996794

min
-3.428254
-3.548824
-3.184377
-3.745356

25%
-0.774890
-0.591841
-0.641675
-0.644144

50%
-0.116401
0.101143
0.002073
-0.013611

75%
0.616366
0.780282
0.680391
0.654328

max
3.366626
2.653656
3.260383
3.927528

``````
``````

In [46]:

# Pick out the entries of column 3 whose magnitude exceeds 3.
col = data[3]
col[col.abs() > 3]

``````
``````

Out[46]:

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

``````
``````

In [48]:

# Rows in which at least one column has magnitude above 3. The axis is passed
# by keyword: the positional form any(1) is deprecated in pandas 1.x and
# rejected in pandas 2.
data[(np.abs(data) > 3).any(axis=1)]

``````
``````

Out[48]:

0
1
2
3

5
-0.539741
0.476985
3.248944
-1.021228

97
-0.774363
0.552936
0.106061
3.927528

102
-0.655054
-0.565230
3.176873
0.959533

305
-2.315555
0.457246
-0.025907
-3.399312

324
0.050188
1.951312
3.260383
0.963301

400
0.146326
0.508391
-0.196713
-3.745356

499
-0.293333
-0.242459
-3.056990
1.918403

523
-3.428254
-0.296336
-0.439938
-0.867165

586
0.275144
1.179227
-3.184377
1.369891

808
-0.362528
-3.548824
1.553205
-2.186301

900
3.366626
-2.372214
0.851010
1.332846

``````
``````

In [52]:

# Cap every value to the interval [-3, 3]: wherever |x| exceeds 3, substitute
# 3 carrying the sign of x. np.where yields a plain ndarray, so the result is
# wrapped straight back into a DataFrame.
data = DataFrame(np.where(np.abs(data) > 3, np.sign(data) * 3, data))
data.describe()

``````
``````

Out[52]:

0
1
2
3

count
1000.000000
1000.000000
1000.000000
1000.000000

mean
-0.067623
0.068473
0.025153
-0.002081

std
0.995485
0.990253
1.003977
0.989736

min
-3.000000
-3.000000
-3.000000
-3.000000

25%
-0.774890
-0.591841
-0.641675
-0.644144

50%
-0.116401
0.101143
0.002073
-0.013611

75%
0.616366
0.780282
0.680391
0.654328

max
3.000000
2.653656
3.000000
3.000000

``````

Permutation and random sampling

``````

In [53]:

# A 5x4 frame holding 0..19, plus a random ordering of its five row positions.
df = DataFrame(np.arange(20).reshape(5, 4))
sampler = np.random.permutation(len(df))
sampler

``````
``````

Out[53]:

array([1, 0, 2, 3, 4])

``````
``````

In [54]:

# The frame in its original row order, for comparison with the reordered view.
df

``````
``````

Out[54]:

0
1
2
3

0
0
1
2
3

1
4
5
6
7

2
8
9
10
11

3
12
13
14
15

4
16
17
18
19

``````
``````

In [55]:

# Reorder the rows by the integer positions in sampler.
df.take(sampler)

``````
``````

Out[55]:

0
1
2
3

1
4
5
6
7

0
0
1
2
3

2
8
9
10
11

3
12
13
14
15

4
16
17
18
19

``````
``````

In [56]:

# Shuffle all row positions and keep the first three: a random sample of three
# distinct rows (no row can be picked twice).
df.take(np.random.permutation(len(df))[:3])

``````
``````

Out[56]:

0
1
2
3

1
4
5
6
7

3
12
13
14
15

4
16
17
18
19

``````
``````

In [57]:

# Ten positions drawn from 0 .. len(bag) - 1; the same position may repeat,
# which makes this sampling with replacement.
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(len(bag), size=10)
sampler

``````
``````

Out[57]:

array([4, 4, 2, 2, 2, 0, 3, 0, 4, 1])

``````
``````

In [59]:

# Look up the bag value at each sampled position.
draws = bag[sampler]
draws

``````
``````

Out[59]:

array([ 4,  4, -1, -1, -1,  5,  6,  5,  4,  7])

``````

Computing indicator / dummy variables

``````

In [60]:

# A categorical 'key' column with three distinct values and a numeric companion.
df = DataFrame(dict(key=list('bbacab'), data1=range(6)))
df

``````
``````

Out[60]:

data1
key

0
0
b

1
1
b

2
2
a

3
3
c

4
4
a

5
5
b

``````
``````

In [64]:

# One indicator column per distinct key value; a row holds 1 in the column
# matching its key and 0 elsewhere.
dummies = pd.get_dummies(df['key'])
dummies

``````
``````

Out[64]:

a
b
c

0
0
1
0

1
0
1
0

2
1
0
0

3
0
0
1

4
1
0
0

5
0
1
0

``````
``````

In [62]:

# get_dummies joins the prefix to each category with its own '_' separator, so
# prefix='key_' yields double-underscore names such as key__a; pass
# prefix='key' to get key_a instead.
pd.get_dummies(df['key'], prefix='key_')

``````
``````

Out[62]:

key__a
key__b
key__c

0
0
1
0

1
0
1
0

2
1
0
0

3
0
0
1

4
1
0
0

5
0
1
0

``````
``````

In [66]:

# Select 'data1' with a list so the result stays a DataFrame, then attach the
# indicator columns row by row via join.
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

``````
``````

Out[66]:

data1
a
b
c

0
0
0
1
0

1
1
0
1
0

2
2
1
0
0

3
3
0
0
1

4
4
1
0
0

5
5
0
1
0

``````
``````

In [68]:

mnames = ['movie_id', 'title', 'genres']
# NOTE(review): the line that opened this read_table call was missing, leaving
# a syntax error and `movies` unbound. The file path and the '::' separator
# (the multi-character separator is why engine='python' is needed) are
# reconstructed -- confirm both against the actual data file location.
movies = pd.read_table('movies.dat', sep='::', header=None,
                       names=mnames, engine = 'python')
movies[:10]

``````
``````

Out[68]:

movie_id
title
genres

0
1
Toy Story (1995)
Animation|Children's|Comedy

1
2
Jumanji (1995)

2
3
Grumpier Old Men (1995)
Comedy|Romance

3
4
Waiting to Exhale (1995)
Comedy|Drama

4
5
Father of the Bride Part II (1995)
Comedy

5
6
Heat (1995)
Action|Crime|Thriller

6
7
Sabrina (1995)
Comedy|Romance

7
8
Tom and Huck (1995)

8
9
Sudden Death (1995)
Action

9
10
GoldenEye (1995)

``````
``````

In [73]:

# Lazily split each movie's pipe-delimited genre string into a set of names.
genre_iter = (set(x.split('|')) for x in movies.genres)
print(type(genre_iter))
# Union all per-movie sets and sort the result; this consumes the generator.
genres = sorted(set.union(*genre_iter))
genres

``````
``````

<class 'generator'>

Out[73]:

['Action',
'Animation',
"Children's",
'Comedy',
'Crime',
'Documentary',
'Drama',
'Fantasy',
'Film-Noir',
'Horror',
'Musical',
'Mystery',
'Romance',
'Sci-Fi',
'Thriller',
'War',
'Western']

``````
``````

In [76]:

# All-zero int32 indicator matrix: one row per movie, one column per genre.
dummies = DataFrame(np.zeros((len(movies), len(genres)), dtype=np.int32),
                    columns=genres)
dummies

``````
``````

Out[76]:

Action
Animation
Children's
Comedy
Crime
Documentary
Drama
Fantasy
Film-Noir
Horror
Musical
Mystery
Romance
Sci-Fi
Thriller
War
Western

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

4
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

5
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

6
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

7
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

8
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

9
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

10
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

11
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

12
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

13
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

14
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

15
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

16
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

17
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

18
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

19
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

20
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

21
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

22
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

23
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

24
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

25
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

26
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

27
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

28
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

29
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...

3853
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3854
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3855
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3856
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3857
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3858
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3859
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3860
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3861
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3862
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3863
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3864
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3865
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3866
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3867
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3868
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3869
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3870
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3871
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3872
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3873
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3874
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3875
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3876
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3877
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3878
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3879
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3880
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3881
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3882
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3883 rows × 18 columns

``````
``````

In [78]:

# Set a 1 in every genre column listed for movie i. dummies was built with the
# default 0..n-1 index, so the enumerate position is also the row label that
# .loc expects (.ix was removed in pandas 1.0).
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1
dummies

``````
``````

Out[78]:

Action
Animation
Children's
Comedy
Crime
Documentary
Drama
Fantasy
Film-Noir
Horror
Musical
Mystery
Romance
Sci-Fi
Thriller
War
Western

0
0
0
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0

1
0
1
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0

2
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0

3
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0

4
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0

5
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0

6
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0

7
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0

8
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

9
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0

10
0
0
0
0
1
0
0
1
0
0
0
0
0
1
0
0
0
0

11
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0

12
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0

13
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0

14
1
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0

15
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0

16
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0

17
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0

18
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0

19
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

20
1
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0

21
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
1
0
0

22
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0

23
0
0
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0

24
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0

25
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0

26
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0

27
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0

28
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0

29
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0

...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...

3853
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0

3854
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0

3855
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0

3856
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0

3857
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0

3858
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0

3859
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0

3860
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0

3861
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0

3862
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0

3863
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0

3864
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0

3865
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0

3866
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0

3867
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0

3868
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0

3869
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0

3870
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0

3871
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0

3872
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0

3873
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0

3874
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0

3875
0
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0

3876
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0

3877
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0

3878
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0

3879
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0

3880
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0

3881
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0

3882
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0

3883 rows × 18 columns

``````
``````

In [79]:

# movies_windic was never defined in the notebook as saved; build it by
# prefixing each indicator column with 'Genre_' (the naming shown in the saved
# output) and joining onto the movie table. .iloc[0] selects the first row by
# position (.ix was removed in pandas 1.0).
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

``````
``````

Out[79]:

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object

``````
``````

In [80]:

# Ten uniform random draws from [0, 1).
values = np.random.rand(10)
values

``````
``````

Out[80]:

array([ 0.75603383,  0.90830844,  0.96588737,  0.17373658,  0.87592824,
0.75415641,  0.163486  ,  0.23784062,  0.85564381,  0.58743194])

``````
``````

In [81]:

# Bucket the values into five 0.2-wide bins, then one-hot encode which bin each
# value landed in.
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

``````
``````

Out[81]:

(0, 0.2]
(0.2, 0.4]
(0.4, 0.6]
(0.6, 0.8]
(0.8, 1]

0
0
0
0
1
0

1
0
0
0
0
1

2
0
0
0
0
1

3
1
0
0
0
0

4
0
0
0
0
1

5
0
0
0
1
0

6
1
0
0
0
0

7
0
1
0
0
0

8
0
0
0
0
1

9
0
0
1
0
0

``````
``````

In [ ]:

``````