DataFrame part 1

Introduction to the pandas DataFrame: loading a CSV, inspecting it, and basic column operations.


In [6]:
import pandas as pd

In [7]:
# Load the NBA players CSV into a DataFrame; no index_col is given, so rows get a default integer index
nba = pd.read_csv('data/nba.csv')

In [8]:
nba.head()


Out[8]:
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
2 John Holland Boston Celtics 30.0 SG 27.0 6-5 205.0 Boston University NaN
3 R.J. Hunter Boston Celtics 28.0 SG 22.0 6-5 185.0 Georgia State 1148640.0
4 Jonas Jerebko Boston Celtics 8.0 PF 29.0 6-10 231.0 NaN 5000000.0

In [8]:
nba.tail()


Out[8]:
Name Team Number Position Age Height Weight College Salary
453 Shelvin Mack Utah Jazz 8.0 PG 26.0 6-3 203.0 Butler 2433333.0
454 Raul Neto Utah Jazz 25.0 PG 24.0 6-1 179.0 NaN 900000.0
455 Tibor Pleiss Utah Jazz 21.0 C 26.0 7-3 256.0 NaN 2900000.0
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0
457 NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [9]:
nba.index


Out[9]:
RangeIndex(start=0, stop=458, step=1)

In [10]:
# Underlying data as a NumPy array; the mixed column types make it dtype=object (see output)
nba.values


Out[10]:
array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ..., 
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [12]:
# Tuple of (number of rows, number of columns)
nba.shape


Out[12]:
(458, 9)

In [14]:
# dtypes for each column
nba.dtypes


Out[14]:
Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [16]:
nba.columns


Out[16]:
Index([u'Name', u'Team', u'Number', u'Position', u'Age', u'Height', u'Weight',
       u'College', u'Salary'],
      dtype='object')

In [18]:
# List of both axes: [row index, column labels]
nba.axes


Out[18]:
[RangeIndex(start=0, stop=458, step=1),
 Index([u'Name', u'Team', u'Number', u'Position', u'Age', u'Height', u'Weight',
        u'College', u'Salary'],
       dtype='object')]

In [20]:
# Very important method: summarises the index range, the non-null count and dtype
# of every column, and the memory usage
nba.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      446 non-null float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB

In [21]:
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0.
# Counting the values of .dtypes gives the same per-dtype tally on every pandas version;
# astype(str) + sort_index() keeps the rows in alphabetical dtype order.
nba.dtypes.astype(str).value_counts().sort_index()


Out[21]:
float64    4
object     5
dtype: int64

Set Index on read_csv()


In [24]:
# index_col='Date' uses the Date column as the row labels instead of a default integer index
revenue = pd.read_csv('data/revenue.csv', index_col='Date')

In [25]:
revenue


Out[25]:
New York Los Angeles Miami
Date
1/1/16 985 122 499
1/2/16 738 788 534
1/3/16 14 20 933
1/4/16 730 904 885
1/5/16 114 71 253
1/6/16 936 502 497
1/7/16 123 996 115
1/8/16 935 492 886
1/9/16 846 954 823
1/10/16 54 285 216

In [27]:
revenue.index


Out[27]:
Index([u'1/1/16', u'1/2/16', u'1/3/16', u'1/4/16', u'1/5/16', u'1/6/16',
       u'1/7/16', u'1/8/16', u'1/9/16', u'1/10/16'],
      dtype='object', name=u'Date')

Sum by Index / Values


In [32]:
# Default axis: sums down each column, giving one total per city
revenue.sum()


Out[32]:
New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [34]:
# Sum across the columns: one total per row (i.e. per date)
revenue.sum(axis='columns')
# Equivalent: revenue.sum(axis=1)


Out[34]:
Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

Select columns


In [36]:
nba.head(3)


Out[36]:
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
2 John Holland Boston Celtics 30.0 SG 27.0 6-5 205.0 Boston University NaN

In [46]:
# One column: bracket selection returns a Series
nba['Name'].head(3)
# Attribute access (nba.Name) is a bad idea: it cannot address labels that contain
# spaces and can clash with existing DataFrame attributes/methods


Out[46]:
0    Avery Bradley
1      Jae Crowder
2     John Holland
Name: Name, dtype: object

In [45]:
# Multiple columns: pass a list of labels; the result is a DataFrame
nba[['Name', 'Team', 'Number']].head(3)


Out[45]:
Name Team Number
0 Avery Bradley Boston Celtics 0.0
1 Jae Crowder Boston Celtics 99.0
2 John Holland Boston Celtics 30.0

In [57]:
# Columns come back in the order given in the list, not in the file's order
select = ["Salary", "Team", "Name"]
nba[select].head(3)


Out[57]:
Salary Team Name
0 7730337.0 Boston Celtics Avery Bradley
1 6796117.0 Boston Celtics Jae Crowder
2 NaN Boston Celtics John Holland

Add Columns


In [75]:
# Reload the CSV, then assign a scalar to a new label: the column is appended last
# and every row receives the same value
nba = pd.read_csv('data/nba.csv')
nba['TEST'] = 'Hello Test'

In [76]:
nba.head(3)


Out[76]:
Name Team Number Position Age Height Weight College Salary TEST
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0 Hello Test
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0 Hello Test
2 John Holland Boston Celtics 30.0 SG 27.0 6-5 205.0 Boston University NaN Hello Test

In [77]:
# Reload the CSV; insert() adds each new column at the given 0-based position
# and modifies nba directly
nba = pd.read_csv('data/nba.csv')
nba.insert(0, column='TEST', value='Bla bla bla')
nba.insert(5, column='ASD', value='Bla bla bla')

In [79]:
nba.head(3)


Out[79]:
TEST Name Team Number Position ASD Age Height Weight College Salary
0 Bla bla bla Avery Bradley Boston Celtics 0.0 PG Bla bla bla 25.0 6-2 180.0 Texas 7730337.0
1 Bla bla bla Jae Crowder Boston Celtics 99.0 SF Bla bla bla 25.0 6-6 235.0 Marquette 6796117.0
2 Bla bla bla John Holland Boston Celtics 30.0 SG Bla bla bla 27.0 6-5 205.0 Boston University NaN

Broadcasting Operations


In [80]:
nba = pd.read_csv('data/nba.csv')

In [87]:
nba['Number'].add(10) # returns a new Series; the original DataFrame is not modified
# Equivalent operator form: nba['Number'] + 10
# Missing values stay NaN (see the last row of the output)


Out[87]:
0       10.0
1      109.0
2       40.0
3       38.0
4       18.0
5      100.0
6       65.0
7       51.0
8       22.0
9       46.0
10      17.0
11      14.0
12      21.0
13      23.0
14      54.0
15      54.0
16      32.0
17      31.0
18      34.0
19      12.0
20      20.0
21      16.0
22      10.0
23      21.0
24      11.0
25      43.0
26      51.0
27      24.0
28      25.0
29      40.0
       ...  
428     18.0
429     15.0
430     33.0
431     27.0
432     14.0
433     19.0
434     45.0
435     21.0
436     10.0
437     13.0
438     54.0
439     34.0
440     12.0
441     31.0
442     43.0
443     13.0
444     20.0
445     21.0
446     25.0
447     37.0
448     30.0
449     15.0
450     12.0
451     33.0
452     51.0
453     18.0
454     35.0
455     31.0
456     34.0
457      NaN
Name: Number, Length: 458, dtype: float64

In [86]:
nba.head(3)


Out[86]:
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
2 John Holland Boston Celtics 30.0 SG 27.0 6-5 205.0 Boston University NaN

In [88]:
nba['Number'].sub(10)
# Equivalent operator form: nba['Number'] - 10


Out[88]:
0     -10.0
1      89.0
2      20.0
3      18.0
4      -2.0
5      80.0
6      45.0
7      31.0
8       2.0
9      26.0
10     -3.0
11     -6.0
12      1.0
13      3.0
14     34.0
15     34.0
16     12.0
17     11.0
18     14.0
19     -8.0
20      0.0
21     -4.0
22    -10.0
23      1.0
24     -9.0
25     23.0
26     31.0
27      4.0
28      5.0
29     20.0
       ... 
428    -2.0
429    -5.0
430    13.0
431     7.0
432    -6.0
433    -1.0
434    25.0
435     1.0
436   -10.0
437    -7.0
438    34.0
439    14.0
440    -8.0
441    11.0
442    23.0
443    -7.0
444     0.0
445     1.0
446     5.0
447    17.0
448    10.0
449    -5.0
450    -8.0
451    13.0
452    31.0
453    -2.0
454    15.0
455    11.0
456    14.0
457     NaN
Name: Number, Length: 458, dtype: float64

In [92]:
nba['Number'].mul(10)
# Equivalent operator form: nba['Number'] * 10


Out[92]:
0        0.0
1      990.0
2      300.0
3      280.0
4       80.0
5      900.0
6      550.0
7      410.0
8      120.0
9      360.0
10      70.0
11      40.0
12     110.0
13     130.0
14     440.0
15     440.0
16     220.0
17     210.0
18     240.0
19      20.0
20     100.0
21      60.0
22       0.0
23     110.0
24      10.0
25     330.0
26     410.0
27     140.0
28     150.0
29     300.0
       ...  
428     80.0
429     50.0
430    230.0
431    170.0
432     40.0
433     90.0
434    350.0
435    110.0
436      0.0
437     30.0
438    440.0
439    240.0
440     20.0
441    210.0
442    330.0
443     30.0
444    100.0
445    110.0
446    150.0
447    270.0
448    200.0
449     50.0
450     20.0
451    230.0
452    410.0
453     80.0
454    250.0
455    210.0
456    240.0
457      NaN
Name: Number, Length: 458, dtype: float64

In [94]:
nba['Number'].div(10)
# Equivalent operator form: nba['Number'] / 10


Out[94]:
0      0.0
1      9.9
2      3.0
3      2.8
4      0.8
5      9.0
6      5.5
7      4.1
8      1.2
9      3.6
10     0.7
11     0.4
12     1.1
13     1.3
14     4.4
15     4.4
16     2.2
17     2.1
18     2.4
19     0.2
20     1.0
21     0.6
22     0.0
23     1.1
24     0.1
25     3.3
26     4.1
27     1.4
28     1.5
29     3.0
      ... 
428    0.8
429    0.5
430    2.3
431    1.7
432    0.4
433    0.9
434    3.5
435    1.1
436    0.0
437    0.3
438    4.4
439    2.4
440    0.2
441    2.1
442    3.3
443    0.3
444    1.0
445    1.1
446    1.5
447    2.7
448    2.0
449    0.5
450    0.2
451    2.3
452    4.1
453    0.8
454    2.5
455    2.1
456    2.4
457    NaN
Name: Number, Length: 458, dtype: float64

value_counts()


In [95]:
nba.head(3)


Out[95]:
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
2 John Holland Boston Celtics 30.0 SG 27.0 6-5 205.0 Boston University NaN

In [97]:
# Number of rows for each distinct team, most frequent first
nba['Team'].value_counts()


Out[97]:
New Orleans Pelicans      19
Memphis Grizzlies         18
Milwaukee Bucks           16
New York Knicks           16
Denver Nuggets            15
Charlotte Hornets         15
Los Angeles Lakers        15
Chicago Bulls             15
San Antonio Spurs         15
Philadelphia 76ers        15
Toronto Raptors           15
Detroit Pistons           15
Boston Celtics            15
Miami Heat                15
Utah Jazz                 15
Portland Trail Blazers    15
Los Angeles Clippers      15
Phoenix Suns              15
Golden State Warriors     15
Cleveland Cavaliers       15
Washington Wizards        15
Atlanta Hawks             15
Brooklyn Nets             15
Houston Rockets           15
Dallas Mavericks          15
Sacramento Kings          15
Indiana Pacers            15
Oklahoma City Thunder     15
Orlando Magic             14
Minnesota Timberwolves    14
Name: Team, dtype: int64

Drop NULL Values - dropna()


In [98]:
nba.tail(3)


Out[98]:
Name Team Number Position Age Height Weight College Salary
455 Tibor Pleiss Utah Jazz 21.0 C 26.0 7-3 256.0 NaN 2900000.0
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0
457 NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [106]:
# Drops every row that has at least one NULL value; returns a new frame, nba itself is unchanged
nba.dropna().tail(3)


Out[106]:
Name Team Number Position Age Height Weight College Salary
452 Trey Lyles Utah Jazz 41.0 PF 20.0 6-10 234.0 Kentucky 2239800.0
453 Shelvin Mack Utah Jazz 8.0 PG 26.0 6-3 203.0 Butler 2433333.0
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0

In [110]:
# Drop a row only when ALL of its values are NULL, and keep the result as the new nba
nba = nba.dropna(how='all')
nba.tail(3)


Out[110]:
Name Team Number Position Age Height Weight College Salary
454 Raul Neto Utah Jazz 25.0 PG 24.0 6-1 179.0 NaN 900000.0
455 Tibor Pleiss Utah Jazz 21.0 C 26.0 7-3 256.0 NaN 2900000.0
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0

In [113]:
# Drop a row when any of the listed columns holds a NULL value
nba.dropna(subset = ["Salary", "College"]).head(3)


Out[113]:
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
3 R.J. Hunter Boston Celtics 28.0 SG 22.0 6-5 185.0 Georgia State 1148640.0

In [114]:
nba.dropna(subset = ["Salary", "College"]).tail(3)


Out[114]:
Name Team Number Position Age Height Weight College Salary
452 Trey Lyles Utah Jazz 41.0 PF 20.0 6-10 234.0 Kentucky 2239800.0
453 Shelvin Mack Utah Jazz 8.0 PG 26.0 6-3 203.0 Butler 2433333.0
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0

Fill NULL Values - fillna()


In [115]:
nba = pd.read_csv('data/nba.csv')

In [116]:
# Replaces every NULL with 0 in all columns, including text columns such as College
nba.fillna(0)


Out[116]:
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
2 John Holland Boston Celtics 30.0 SG 27.0 6-5 205.0 Boston University 0.0
3 R.J. Hunter Boston Celtics 28.0 SG 22.0 6-5 185.0 Georgia State 1148640.0
4 Jonas Jerebko Boston Celtics 8.0 PF 29.0 6-10 231.0 0 5000000.0
5 Amir Johnson Boston Celtics 90.0 PF 29.0 6-9 240.0 0 12000000.0
6 Jordan Mickey Boston Celtics 55.0 PF 21.0 6-8 235.0 LSU 1170960.0
7 Kelly Olynyk Boston Celtics 41.0 C 25.0 7-0 238.0 Gonzaga 2165160.0
8 Terry Rozier Boston Celtics 12.0 PG 22.0 6-2 190.0 Louisville 1824360.0
9 Marcus Smart Boston Celtics 36.0 PG 22.0 6-4 220.0 Oklahoma State 3431040.0
10 Jared Sullinger Boston Celtics 7.0 C 24.0 6-9 260.0 Ohio State 2569260.0
11 Isaiah Thomas Boston Celtics 4.0 PG 27.0 5-9 185.0 Washington 6912869.0
12 Evan Turner Boston Celtics 11.0 SG 27.0 6-7 220.0 Ohio State 3425510.0
13 James Young Boston Celtics 13.0 SG 20.0 6-6 215.0 Kentucky 1749840.0
14 Tyler Zeller Boston Celtics 44.0 C 26.0 7-0 253.0 North Carolina 2616975.0
15 Bojan Bogdanovic Brooklyn Nets 44.0 SG 27.0 6-8 216.0 0 3425510.0
16 Markel Brown Brooklyn Nets 22.0 SG 24.0 6-3 190.0 Oklahoma State 845059.0
17 Wayne Ellington Brooklyn Nets 21.0 SG 28.0 6-4 200.0 North Carolina 1500000.0
18 Rondae Hollis-Jefferson Brooklyn Nets 24.0 SG 21.0 6-7 220.0 Arizona 1335480.0
19 Jarrett Jack Brooklyn Nets 2.0 PG 32.0 6-3 200.0 Georgia Tech 6300000.0
20 Sergey Karasev Brooklyn Nets 10.0 SG 22.0 6-7 208.0 0 1599840.0
21 Sean Kilpatrick Brooklyn Nets 6.0 SG 26.0 6-4 219.0 Cincinnati 134215.0
22 Shane Larkin Brooklyn Nets 0.0 PG 23.0 5-11 175.0 Miami (FL) 1500000.0
23 Brook Lopez Brooklyn Nets 11.0 C 28.0 7-0 275.0 Stanford 19689000.0
24 Chris McCullough Brooklyn Nets 1.0 PF 21.0 6-11 200.0 Syracuse 1140240.0
25 Willie Reed Brooklyn Nets 33.0 PF 26.0 6-10 220.0 Saint Louis 947276.0
26 Thomas Robinson Brooklyn Nets 41.0 PF 25.0 6-10 237.0 Kansas 981348.0
27 Henry Sims Brooklyn Nets 14.0 C 26.0 6-10 248.0 Georgetown 947276.0
28 Donald Sloan Brooklyn Nets 15.0 PG 28.0 6-3 205.0 Texas A&M 947276.0
29 Thaddeus Young Brooklyn Nets 30.0 PF 27.0 6-8 221.0 Georgia Tech 11235955.0
... ... ... ... ... ... ... ... ... ...
428 Al-Farouq Aminu Portland Trail Blazers 8.0 SF 25.0 6-9 215.0 Wake Forest 8042895.0
429 Pat Connaughton Portland Trail Blazers 5.0 SG 23.0 6-5 206.0 Notre Dame 625093.0
430 Allen Crabbe Portland Trail Blazers 23.0 SG 24.0 6-6 210.0 California 947276.0
431 Ed Davis Portland Trail Blazers 17.0 C 27.0 6-10 240.0 North Carolina 6980802.0
432 Maurice Harkless Portland Trail Blazers 4.0 SF 23.0 6-9 215.0 St. John's 2894059.0
433 Gerald Henderson Portland Trail Blazers 9.0 SG 28.0 6-5 215.0 Duke 6000000.0
434 Chris Kaman Portland Trail Blazers 35.0 C 34.0 7-0 265.0 Central Michigan 5016000.0
435 Meyers Leonard Portland Trail Blazers 11.0 PF 24.0 7-1 245.0 Illinois 3075880.0
436 Damian Lillard Portland Trail Blazers 0.0 PG 25.0 6-3 195.0 Weber State 4236287.0
437 C.J. McCollum Portland Trail Blazers 3.0 SG 24.0 6-4 200.0 Lehigh 2525160.0
438 Luis Montero Portland Trail Blazers 44.0 SG 23.0 6-7 185.0 Westchester CC 525093.0
439 Mason Plumlee Portland Trail Blazers 24.0 C 26.0 6-11 235.0 Duke 1415520.0
440 Brian Roberts Portland Trail Blazers 2.0 PG 30.0 6-1 173.0 Dayton 2854940.0
441 Noah Vonleh Portland Trail Blazers 21.0 PF 20.0 6-9 240.0 Indiana 2637720.0
442 Trevor Booker Utah Jazz 33.0 PF 28.0 6-8 228.0 Clemson 4775000.0
443 Trey Burke Utah Jazz 3.0 PG 23.0 6-1 191.0 Michigan 2658240.0
444 Alec Burks Utah Jazz 10.0 SG 24.0 6-6 214.0 Colorado 9463484.0
445 Dante Exum Utah Jazz 11.0 PG 20.0 6-6 190.0 0 3777720.0
446 Derrick Favors Utah Jazz 15.0 PF 24.0 6-10 265.0 Georgia Tech 12000000.0
447 Rudy Gobert Utah Jazz 27.0 C 23.0 7-1 245.0 0 1175880.0
448 Gordon Hayward Utah Jazz 20.0 SF 26.0 6-8 226.0 Butler 15409570.0
449 Rodney Hood Utah Jazz 5.0 SG 23.0 6-8 206.0 Duke 1348440.0
450 Joe Ingles Utah Jazz 2.0 SF 28.0 6-8 226.0 0 2050000.0
451 Chris Johnson Utah Jazz 23.0 SF 26.0 6-6 206.0 Dayton 981348.0
452 Trey Lyles Utah Jazz 41.0 PF 20.0 6-10 234.0 Kentucky 2239800.0
453 Shelvin Mack Utah Jazz 8.0 PG 26.0 6-3 203.0 Butler 2433333.0
454 Raul Neto Utah Jazz 25.0 PG 24.0 6-1 179.0 0 900000.0
455 Tibor Pleiss Utah Jazz 21.0 C 26.0 7-3 256.0 0 2900000.0
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0
457 0 0 0.0 0 0.0 0 0.0 0 0.0

458 rows × 9 columns


In [118]:
nba = pd.read_csv('data/nba.csv')

In [120]:
# Assign the filled column back instead of calling fillna(inplace=True) on nba['College']:
# the in-place call acts on an intermediate Series, which newer pandas (Copy-on-Write)
# no longer propagates to the parent DataFrame.
nba['College'] = nba['College'].fillna("No College")

In [121]:
# Assign the filled column back instead of calling fillna(inplace=True) on nba['Salary']:
# the in-place call acts on an intermediate Series, which newer pandas (Copy-on-Write)
# no longer propagates to the parent DataFrame.
nba['Salary'] = nba['Salary'].fillna(-1)

In [122]:
nba.tail()


Out[122]:
Name Team Number Position Age Height Weight College Salary
453 Shelvin Mack Utah Jazz 8.0 PG 26.0 6-3 203.0 Butler 2433333.0
454 Raul Neto Utah Jazz 25.0 PG 24.0 6-1 179.0 No College 900000.0
455 Tibor Pleiss Utah Jazz 21.0 C 26.0 7-3 256.0 No College 2900000.0
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0
457 NaN NaN NaN NaN NaN NaN NaN No College -1.0

In [123]:
nba.head()


Out[123]:
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
2 John Holland Boston Celtics 30.0 SG 27.0 6-5 205.0 Boston University -1.0
3 R.J. Hunter Boston Celtics 28.0 SG 22.0 6-5 185.0 Georgia State 1148640.0
4 Jonas Jerebko Boston Celtics 8.0 PF 29.0 6-10 231.0 No College 5000000.0

AsType


In [142]:
nba = pd.read_csv('data/nba.csv')
nba.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      446 non-null float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB

In [143]:
# Drop only the rows in which every value is NULL
nba = nba.dropna(how='all')
nba.tail()


Out[143]:
Name Team Number Position Age Height Weight College Salary
452 Trey Lyles Utah Jazz 41.0 PF 20.0 6-10 234.0 Kentucky 2239800.0
453 Shelvin Mack Utah Jazz 8.0 PG 26.0 6-3 203.0 Butler 2433333.0
454 Raul Neto Utah Jazz 25.0 PG 24.0 6-1 179.0 NaN 900000.0
455 Tibor Pleiss Utah Jazz 21.0 C 26.0 7-3 256.0 NaN 2900000.0
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0

In [144]:
# Fix NULL values manually, one column at a time.
# The filled columns are assigned back rather than using fillna(inplace=True) on a
# column selection: that in-place call acts on an intermediate Series, which newer
# pandas (Copy-on-Write) no longer propagates to the parent DataFrame.
nba['Salary'] = nba['Salary'].fillna(0)
nba['College'] = nba['College'].fillna('No College')

In [145]:
nba.head(5)


Out[145]:
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337.0
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117.0
2 John Holland Boston Celtics 30.0 SG 27.0 6-5 205.0 Boston University 0.0
3 R.J. Hunter Boston Celtics 28.0 SG 22.0 6-5 185.0 Georgia State 1148640.0
4 Jonas Jerebko Boston Celtics 8.0 PF 29.0 6-10 231.0 No College 5000000.0

In [147]:
# Ready to start!
nba.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB

In [148]:
nba.dtypes


Out[148]:
Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [150]:
# Convert Salary to integer; this works because its NULLs were filled above
# (a column that still contains NaN cannot be cast to int)
nba['Salary'] = nba['Salary'].astype('int')

In [152]:
nba.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int64
dtypes: float64(3), int64(1), object(5)
memory usage: 55.7+ KB

In [153]:
# Cast both columns to integer in a single astype() call using a column -> dtype mapping
nba = nba.astype({'Number': 'int', 'Age': 'int'})

In [154]:
nba.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null int64
Position    457 non-null object
Age         457 non-null int64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int64
dtypes: float64(1), int64(3), object(5)
memory usage: 55.7+ KB

Unique Values


In [156]:
nba['Position'].nunique()


Out[156]:
5

Save Memory using Category


In [157]:
# Position has only 5 distinct values, so store it as a category;
# compare the memory usage reported by info() below with the previous one
nba['Position'] = nba['Position'].astype('category')

In [158]:
nba.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null int64
Position    457 non-null category
Age         457 non-null int64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int64
dtypes: category(1), float64(1), int64(3), object(4)
memory usage: 52.8+ KB

In [159]:
nba['Team'].nunique()


Out[159]:
30

In [160]:
# Team has only 30 distinct values across 457 rows: another good category candidate
nba['Team'] = nba['Team'].astype('category')

In [161]:
nba.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null category
Number      457 non-null int64
Position    457 non-null category
Age         457 non-null int64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int64
dtypes: category(2), float64(1), int64(3), object(3)
memory usage: 51.1+ KB

In [162]:
nba.head()


Out[162]:
Name Team Number Position Age Height Weight College Salary
0 Avery Bradley Boston Celtics 0 PG 25 6-2 180.0 Texas 7730337
1 Jae Crowder Boston Celtics 99 SF 25 6-6 235.0 Marquette 6796117
2 John Holland Boston Celtics 30 SG 27 6-5 205.0 Boston University 0
3 R.J. Hunter Boston Celtics 28 SG 22 6-5 185.0 Georgia State 1148640
4 Jonas Jerebko Boston Celtics 8 PF 29 6-10 231.0 No College 5000000

Sorting Values


In [180]:
nba = pd.read_csv('data/nba.csv')
nba.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      446 non-null float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB

In [166]:
# Youngest players first; without inplace=True this returns a sorted copy
nba.sort_values('Age', ascending=True).head(3)


Out[166]:
Name Team Number Position Age Height Weight College Salary
226 Rashad Vaughn Milwaukee Bucks 20.0 SG 19.0 6-6 202.0 UNLV 1733040.0
122 Devin Booker Phoenix Suns 1.0 SG 19.0 6-6 206.0 Kentucky 2127840.0
40 Kristaps Porzingis New York Knicks 6.0 PF 20.0 7-3 240.0 NaN 4131720.0

In [167]:
# Highest salary first; rows without a salary (NaN) end up at the bottom
nba = nba.sort_values(by='Salary', ascending=False)

In [170]:
nba.head(3)


Out[170]:
Name Team Number Position Age Height Weight College Salary
109 Kobe Bryant Los Angeles Lakers 24.0 SF 37.0 6-6 212.0 NaN 25000000.0
169 LeBron James Cleveland Cavaliers 23.0 SF 31.0 6-8 250.0 NaN 22970500.0
33 Carmelo Anthony New York Knicks 7.0 SF 32.0 6-8 240.0 Syracuse 22875000.0

In [173]:
nba.tail()


Out[173]:
Name Team Number Position Age Height Weight College Salary
264 Jordan Farmar Memphis Grizzlies 4.0 PG 29.0 6-2 180.0 UCLA NaN
353 Dorell Wright Miami Heat 11.0 SF 30.0 6-9 205.0 NaN NaN
171 Dahntay Jones Cleveland Cavaliers 30.0 SG 35.0 6-6 225.0 Duke NaN
46 Elton Brand Philadelphia 76ers 42.0 PF 37.0 6-9 254.0 Duke NaN
457 NaN NaN NaN NaN NaN NaN NaN NaN NaN

Sorting Values by Multiple Columns


In [171]:
# Salary descending, then Age ascending to order players with equal salaries
nba = nba.sort_values(by=['Salary', 'Age'], ascending=[False, True])

In [172]:
nba.head()


Out[172]:
Name Team Number Position Age Height Weight College Salary
109 Kobe Bryant Los Angeles Lakers 24.0 SF 37.0 6-6 212.0 NaN 25000000.0
169 LeBron James Cleveland Cavaliers 23.0 SF 31.0 6-8 250.0 NaN 22970500.0
33 Carmelo Anthony New York Knicks 7.0 SF 32.0 6-8 240.0 Syracuse 22875000.0
251 Dwight Howard Houston Rockets 12.0 C 30.0 6-11 265.0 NaN 22359364.0
339 Chris Bosh Miami Heat 1.0 PF 32.0 6-11 235.0 Georgia Tech 22192730.0

In [187]:
# Team in alphabetical order, then Salary descending within each team
nba = nba.sort_values(by=['Team', 'Salary'], ascending=[True, False])

In [188]:
nba.head()


Out[188]:
Name Team Number Position Age Height Weight College Salary
315 Paul Millsap Atlanta Hawks 4.0 PF 31.0 6-8 246.0 Louisiana Tech 18671659.0
312 Al Horford Atlanta Hawks 15.0 C 30.0 6-10 245.0 Florida 12000000.0
321 Tiago Splitter Atlanta Hawks 11.0 C 31.0 6-11 245.0 NaN 9756250.0
323 Jeff Teague Atlanta Hawks 0.0 PG 27.0 6-2 186.0 Wake Forest 8000000.0
314 Kyle Korver Atlanta Hawks 26.0 SG 35.0 6-7 212.0 Creighton 5746479.0

Sort Index


In [196]:
nba = pd.read_csv('data/nba.csv')
nba.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      446 non-null float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB

In [197]:
# Reverse the row order by sorting on the index labels, highest label first
nba = nba.sort_index(ascending=False)

In [198]:
nba.head()


Out[198]:
Name Team Number Position Age Height Weight College Salary
457 NaN NaN NaN NaN NaN NaN NaN NaN NaN
456 Jeff Withey Utah Jazz 24.0 C 26.0 7-0 231.0 Kansas 947276.0
455 Tibor Pleiss Utah Jazz 21.0 C 26.0 7-3 256.0 NaN 2900000.0
454 Raul Neto Utah Jazz 25.0 PG 24.0 6-1 179.0 NaN 900000.0
453 Shelvin Mack Utah Jazz 8.0 PG 26.0 6-3 203.0 Butler 2433333.0

Rank


In [203]:
# Reload the CSV, then fill missing salaries with 0 so the column can be cast to int
nba = pd.read_csv('data/nba.csv')
nba['Salary'] = nba['Salary'].fillna(0).astype('int')
nba.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      458 non-null int64
dtypes: float64(3), int64(1), object(5)
memory usage: 32.3+ KB

In [208]:
# Rank 1 = highest salary.
# NOTE(review): rank() defaults to method='average', so tied salaries receive a
# fractional rank that astype('int') then truncates; consider method='min' or
# 'dense' if whole-number tie ranks are wanted.
nba['Salary Rank'] = nba['Salary'].rank(ascending=False).astype('int')

In [209]:
nba.head()


Out[209]:
Name Team Number Position Age Height Weight College Salary Salary Rank
0 Avery Bradley Boston Celtics 0.0 PG 25.0 6-2 180.0 Texas 7730337 97
1 Jae Crowder Boston Celtics 99.0 SF 25.0 6-6 235.0 Marquette 6796117 110
2 John Holland Boston Celtics 30.0 SG 27.0 6-5 205.0 Boston University 0 452
3 R.J. Hunter Boston Celtics 28.0 SG 22.0 6-5 185.0 Georgia State 1148640 322
4 Jonas Jerebko Boston Celtics 8.0 PF 29.0 6-10 231.0 NaN 5000000 147

In [212]:
nba.sort_values('Salary', ascending=False).head()
# Alternative using the rank column: nba.sort_values('Salary Rank', ascending=True).head()


Out[212]:
Name Team Number Position Age Height Weight College Salary Salary Rank
109 Kobe Bryant Los Angeles Lakers 24.0 SF 37.0 6-6 212.0 NaN 25000000 1
169 LeBron James Cleveland Cavaliers 23.0 SF 31.0 6-8 250.0 NaN 22970500 2
33 Carmelo Anthony New York Knicks 7.0 SF 32.0 6-8 240.0 Syracuse 22875000 3
251 Dwight Howard Houston Rockets 12.0 C 30.0 6-11 265.0 NaN 22359364 4
339 Chris Bosh Miami Heat 1.0 PF 32.0 6-11 235.0 Georgia Tech 22192730 5

In [ ]: