In [3]:
from pandas import Series

# When a Series is displayed, its index labels are shown in the left-hand
# column next to the values.
Series([1, 2, 3, 4, 5])
Out[3]:
In [4]:
# The values of a Series are exposed as a plain array, while the index is a
# dedicated pandas Index type.
a = Series([1, 2, 3, 4, 5])
(a.values, a.index)
Out[4]:
In [24]:
# A Series is unnamed by default; pass name= to give it one
Series(
    [1, 2, 3, 4],
    name="My Series",
)
Out[24]:
In [5]:
# Replace the default 0..n-1 index with custom labels, one per value
indexed_series = Series([1, 2, 3, 4, 5], index=list('abcde'))
indexed_series
Out[5]:
In [6]:
# Elements can be looked up by their assigned label or by integer position.
# Use .iloc for positions: treating a plain integer inside [] as a position on
# a label-indexed Series is deprecated since pandas 2.1.
indexed_series['c'], indexed_series.iloc[0]
# In other words, a Series behaves much like an ordered dict
Out[6]:
In [67]:
# A Series can also be built straight from a Python dict: the keys become the
# index labels and the dict values become the Series values.
country_counts = {'Netherlands': 5, 'Belgium': 10, 'France': 21}
myseries = Series(country_counts)
myseries
Out[67]:
In [68]:
# You can access it by position (.iloc) or by index label. Which element comes
# first depends on the pandas version: pandas >= 0.23 on Python >= 3.6 keeps the
# dict's insertion order (Netherlands), older versions sorted the keys (Belgium).
myseries.iloc[0], myseries['Netherlands']
Out[68]:
As with regular NumPy arrays, you can create boolean masks on a `Series` and use them to select only certain values of the `Series`, like so:
In [69]:
# Build a mask that is True for the odd values, then keep only those entries
is_odd = myseries % 2 == 1
myseries[is_odd]
Out[69]:
In [71]:
# Order the entries by their value (ascending); the index labels travel along
sorted_by_value = myseries.sort_values()
sorted_by_value
Out[71]:
A useful feature of `Series` is that the operations acting on them are often smart enough to combine values by index. For example, when adding two `Series` together, the values that share the same index label (= key) are automatically summed, while labels that occur in only one of the two `Series` yield `NaN`, like so:
In [13]:
# Two dicts whose keys only partly overlap ('UK' vs 'USA')
a_dict = {'Netherlands': 5, 'Belgium': 10, 'France': 21, 'UK': 5}
b_dict = {'Netherlands': 2, 'Belgium': 1, 'France': -9, 'USA': 4}
a, b = Series(a_dict), Series(b_dict)

# Addition aligns on the index labels; a label present in only one of the two
# Series ends up as NaN in the result.
a + b
Out[13]:
Plain Python dictionaries do not support this kind of addition:
In [14]:
# Plain dicts have no "+" operator, so this raises a TypeError. Catch it and
# show the message so the notebook still runs from top to bottom.
try:
    a_dict + b_dict
except TypeError as err:
    print("TypeError: {}".format(err))
Where a `Series` is like a single associative array, a `DataFrame` is a rectangular table of data which contains an ordered collection of columns (i.e. `Series`).
In [1]:
# A plain Python list of records: one dict per customer, all sharing the same
# four keys.
customers = [
    {'name': "John", 'last_name': "Smith", 'age': 43, 'customer_nr': 12345},
    {'name': "Mary", 'last_name': "Potter", 'age': 25, 'customer_nr': 67889},
    {'name': "Rose", 'last_name': "Harrison", 'age': 39, 'customer_nr': 23456},
    {'name': "John", 'last_name': "Ford", 'age': 56, 'customer_nr': 99999},
    {'name': "Patrick", 'last_name': "Harrison", 'age': 41, 'customer_nr': 7777},
]
In [2]:
# Let's create a DataFrame from this: every dict becomes a row and every key
# becomes a column.
import pandas as pd
customers_df = pd.DataFrame(customers)
# print(...) with a single argument works on both Python 2 and Python 3
print(customers_df)  # As you'll notice, pandas does pretty printing!
In [15]:
import numpy as np

# Select the rows whose 'name' column equals "John" by means of a boolean mask
is_john = customers_df['name'] == "John"
customers_df[is_john]
Out[15]:
In [18]:
# Alternative way to define the same table: a dict that maps each column name
# to the list of values for that column.
customers = {
    'name': ["John", "Mary", "Rose", "John", "Patrick"],
    'last_name': ["Smith", "Potter", "Harrison", "Ford", "Harrison"],
    'age': [43, 25, 39, 56, 41],
    'customer_nr': [12345, 67889, 23456, 99999, 7777],
}
customers_df = pd.DataFrame(customers)
# print(...) with a single argument works on both Python 2 and Python 3
print(customers_df)
In [47]:
# Rows are numbered by default, but they can carry labels too: with a dict of
# dicts the outer keys become the columns and the inner keys the row labels.
population = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
pd.DataFrame(population)
Out[47]:
Pandas then provides a bunch of methods on the `DataFrame` that allow you to easily manipulate the data.
In [33]:
# Accessing a certain column (a Series gets returned), either by key or by
# attribute name. A single pre-formatted argument keeps print(...) valid on
# both Python 2 and Python 3.
print("{}\n\n{}".format(customers_df['name'], customers_df.name))
In [36]:
# df.loc[label] returns the row with that index label. A row is itself a
# Series, which you can tell from the output because it has a Name and a dtype.
third_customer = customers_df.loc[2]
third_customer
Out[36]:
In [72]:
# Transpose: swap rows and columns
transposed_customers = customers_df.T
transposed_customers
Out[72]:
In [74]:
# Sort the rows by one column; pass a list of column names instead to sort by
# several columns at once.
customers_df.sort_values('age')
Out[74]:
In [20]:
# Count how many times each first name occurs (there are 2 Johns!)
first_names = customers_df['name']
first_names.value_counts()
Out[20]:
In [78]:
# sum() sums the columns that have a dtype that can be summed.
# describe() provides summary statistics for the columns whose dtype supports it.
# There are many more of these convenience functions, like mean, median, prod,
# std, var, min/max, etc.
# A single pre-formatted argument keeps print(...) valid on Python 2 and 3.
print("{}\n\n{}".format(customers_df.sum(), customers_df.describe()))
In [40]:
# Assigning to a column label adds the column when it is new and overwrites it
# when it already exists. A scalar is broadcast to every row; you can also pass
# an array to specify each value.
import datetime

signup_moment = datetime.datetime.now()
customers_df['signup_date'] = signup_moment
customers_df['customer_nr'] = 1234  # set all values to 1234
customers_df
Out[40]:
In [41]:
# Boolean masks: the comparison yields a True/False Series, and indexing the
# DataFrame with it keeps only the rows where it is True, i.e. all customers
# that are older than 40.
older_than_40 = customers_df['age'] > 40
customers_df[older_than_40]
Out[41]:
In [42]:
# you can plot counts (do matplotlib plots inline)
%matplotlib inline
name_counts = customers_df['name'].value_counts()
name_counts.plot(kind='barh') # barh = bar horizontal
Out[42]:
In [54]:
# A small numeric DataFrame with two columns, used by the examples that follow
readings = {
    'elevation': [123, -23, 456],
    'signal_strength': [-783, 123, 453],
}
numbers = pd.DataFrame(readings)
numbers
Out[54]:
In [53]:
# NumPy functions can be applied to a DataFrame directly; here np.abs is used
# to take the absolute value of every element.
import numpy as np

absolute_numbers = np.abs(numbers)
absolute_numbers
Out[53]:
In [57]:
# Apply a custom function to one column (= one Series) at a time
def add_hundred(col_series):
    """Return the given column with 100 added to each of its values."""
    return col_series + 100

numbers.apply(add_hundred)
Out[57]:
In [79]:
# Apply a custom function to one element at a time. DataFrame.applymap is
# deprecated since pandas 2.1 in favour of DataFrame.map, so use map when the
# installed pandas provides it and fall back to applymap otherwise.
elementwise = numbers.map if hasattr(numbers, 'map') else numbers.applymap
elementwise(lambda val: 0 if val < 100 else val)
Out[79]: