In [1]:
#This is a comment
#This is all blackboxed for now--DON'T worry about it
# Render our plots inline
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)
In [ ]:
Rk,G,Date,Age,Tm,,Opp,,GS,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
1,1,2013-10-29,28-303,MIA,,CHI,W (+12),1,38:01,5,11,.455,0,1,.000,7,9,.778,0,6,6,8,1,0,2,0,17,16.9,+8
2,2,2013-10-30,28-304,MIA,@,PHI,L (-4),1,36:38,9,17,.529,4,7,.571,3,4,.750,0,4,4,13,0,0,4,3,25,21.4,-8
3,3,2013-11-01,28-306,MIA,@,BRK,L (-1),1,42:14,11,19,.579,1,2,.500,3,5,.600,1,6,7,6,2,1,5,2,26,19.9,-3
4,4,2013-11-03,28-308,MIA,,WAS,W (+10),1,34:41,9,14,.643,3,5,.600,4,5,.800,0,3,3,5,1,0,6,2,25,17.0,+16
5,5,2013-11-05,28-310,MIA,@,TOR,W (+9),1,36:01,13,20,.650,1,3,.333,8,8,1.000,2,6,8,8,0,1,1,2,35,33.9,+3
In [ ]:
#looks much nicer on a wide screen!
In [ ]:
In [ ]:
In [3]:
import csv
import io
import sys
import urllib
from contextlib import closing

# urlopen moved between Python 2 and Python 3; import it from wherever it lives.
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

url = "https://gist.githubusercontent.com/aparrish/cb1672e98057ea2ab7a1/raw/13166792e0e8436221ef85d2a655f1965c400f75/lebron_james.csv"

# Download the file and parse it into a list of rows (each row is a list of
# strings). closing() releases the connection even if parsing fails.
with closing(urlopen(url)) as response:
    if sys.version_info[0] >= 3:
        # On Python 3 the response yields bytes but csv.reader needs text.
        # NOTE(review): assumes the file is UTF-8 encoded -- confirm.
        lines = io.TextIOWrapper(response, encoding="utf-8", newline="")
    else:
        lines = response
    stats = list(csv.reader(lines))
#example courtesy the great Allison Parrish!
#What different things do urlopen(url) then csv.reader() and then list() do?
In [4]:
# The first row that csv.reader produced (a list of strings)
stats[0]
Out[4]:
In [5]:
# How many rows were read in total (the first row counts too)
len(stats)
Out[5]:
In [6]:
# Two indexes in a row: take the list at index 74, then its item at index 0
stats[74][0]
Out[6]:
You can compose indexes! This is the 0th item of the 74th list (counting from zero).
BUT I'm not going to torture you with this lower-level analysis (for now).
The library Pandas
provides us with a powerful overlay that lets us use matrices while always keeping their row and column names: a spreadsheet on speed. It lets us work directly with the "DataFrame" datatype, which keeps track of values and their names for us. And it lets us perform many operations on slices of the dataframe without having to write for
loops and the like. This is more convenient and usually runs faster.
In [1]:
import pandas as pd #we've already done this but just to remind you you'll need to
In [8]:
#Let's start with yet another way to read csv files, this time from `pandas`
import os
# Folder that holds the data files. Set the COMP_IN_CONTEXT_DIR environment
# variable to point at your own copy; otherwise the original author's path is used.
directory = os.environ.get(
    "COMP_IN_CONTEXT_DIR",
    "/Users/mljones/repositories/comp_in_context_trial/",
)
if os.path.isdir(directory):
    # Make it the working directory so files can be opened by bare filename.
    os.chdir(directory)
else:
    # Don't crash on machines that lack this folder: stay in the current
    # directory and say so, so a later "file not found" is easy to diagnose.
    print("Data directory %r not found; staying in %r" % (directory, os.getcwd()))
Now we read a big csv file using a function from pandas
called pd.read_csv()
In [9]:
# Load the whole CSV into a DataFrame (a comma is already the default separator)
df = pd.read_csv('HMXPC_13.csv')
In [10]:
# Display the DataFrame; the row and column counts appear at the bottom
df
Out[10]:
Note at the bottom that the display tells us how many rows and columns we're dealing with.
As a general rule, pandas dataframe objects default to slicing by column using a syntax you'll know from dicts
as in df["course_id"].
In [11]:
# Select one column by its name, using the same bracket syntax as a dict lookup
df["course_id"]
Out[11]:
In [12]:
# First choose the column, then slice it: rows 3340 up to (not including) 3350
df["course_id"][3340:3350] #pick out a list of values from ONE column
Out[12]:
Instead of (column, row) we use name_of_dataframe[column name][row #]
In [13]:
# A slice (start:stop) directly inside df[...] selects whole ROWS, all columns
df[3340:3350] # SLICE a list of ROWS
Out[13]:
In [14]:
#This was _not_ in class PREPARE FOR TERRIBLE ERROR!
#THIS DOESN'T WORK: a single value inside df[...] is looked up as a COLUMN
#name, and there is no column called 3340, so pandas raises a KeyError.
#We catch the error and print it so that "Run All" can carry on past this cell.
try:
    df[3340]
except KeyError as err:
    print("KeyError: %s" % err)
In [15]:
#That's icky.
#to pick out one row by its index label use `.loc`
#(older tutorials use `.ix`, which has been removed from current pandas)
df.loc[3340]
Out[15]:
Why? A good question. Now try passing a list of just one row:
In [17]:
# Passing a LIST of labels (even a list of one) to `.loc` gives back a
# DataFrame rather than a single row. (`.ix` has been removed from pandas.)
df.loc[[3340]]
Out[17]:
We can pick out columns using their names and with a slice of rows.
In [19]:
# Column 'final_cc_cname_DI', rows 100 up to (not including) 110
df['final_cc_cname_DI'][100:110]
Out[19]:
In [18]:
# Show the data type pandas assigned to each column
df.dtypes
Out[18]:
When reading a CSV, Pandas parses each column and attempts to discern what sort of data is within. It's good but not infallible.
In [19]:
# Read the file again, this time telling pandas which columns hold dates
df = pd.read_csv(
    'HMXPC_13.csv',
    parse_dates=['start_time_DI', 'last_event_DI'],
)
Note that we pass a list of column names to `parse_dates`, so that several columns are parsed as dates at once.
In [27]:
# Look at the column again after re-reading with parse_dates.
# NOTE(review): the dtype shown should now be a datetime type -- confirm in the output.
df["start_time_DI"]
Out[27]:
Now we can count how many times each start date occurs (that is, how many rows share each start date).
In [28]:
# Tally how many rows share each distinct value of 'start_time_DI'
startdates=df['start_time_DI'].value_counts()
# Exercise to the reader: how might you do this without using the `.value_counts()` method?
In [29]:
# Display the tallies: each distinct start date with its count
startdates
Out[29]:
In [25]:
# With no arguments, .plot() draws a line chart of the counts
startdates.plot()
Out[25]:
In [26]:
# The same plot again, this time with a title across the top
startdates.plot(title="I can't it's not butter.")
Out[26]:
What are the other kinds of plot? Try a bar chart:
In [28]:
# kind="bar" switches from a line to a bar chart: one bar per distinct start date
startdates.plot(kind="bar")
Out[28]:
In [29]:
#Ok, let's consider how many times different people played a video
# .dropna() discards the rows where 'nplay_video' is missing before plotting
df["nplay_video"].dropna().plot()
Out[29]:
In [ ]: