In [1]:
import pandas as pd
import numpy as np
In [2]:
# Create a Series with an auto-generated (0-based) index
pd.Series([100, 101, 110, 111], dtype=np.int8)
Out[2]:
In [3]:
# Create a Series with custom index labels
pd.Series([100, 101, 110, 111], index=list('abcd'), dtype=np.int8)
Out[3]:
In [4]:
# Create a Series from a dictionary: keys become the index labels
d = {'a': 100, 'b': 101, 'c': 110, 'd': 111}
pd.Series(d, dtype=np.int8)
Out[4]:
In [5]:
# Calories per meal on day 1; dict keys become the index labels
day1 = pd.Series({'breakfast': 400, 'lunch': 600, 'dinner': 400}, dtype=np.int16)
day1
Out[5]:
In [6]:
# Calories per meal on day 2 (note: 'snack' here instead of 'dinner')
day2 = pd.Series({'breakfast': 350, 'lunch': 500, 'snack': 150}, dtype=np.int16)
day2
Out[6]:
In [7]:
# Addition aligns the two Series on their index labels: values are
# added only where a label exists in both ('breakfast', 'lunch');
# labels present in just one Series ('dinner', 'snack') yield NaN,
# which also upcasts the int16 result to float64.
day1 + day2
Out[7]:
In [8]:
def init_df():
    """Return a fresh 4x4 demo frame holding the integers 1..16.

    Rows are labelled 'w'..'z' and columns 'A'..'D'.
    """
    values = np.arange(1, 17).reshape(4, 4)
    return pd.DataFrame(values, index=list('wxyz'), columns=list('ABCD'))

df = init_df()
df
Out[8]:
In [9]:
# Create a new column based on another column.
# Assigning to a new column label mutates df in place.
df['E'] = df['A'] ** 2
df
Out[9]:
In [10]:
# Return a copy of the DataFrame with the given columns left out;
# df itself is not modified (no inplace flag)
df.drop(columns=['A', 'E'])
Out[10]:
In [11]:
# Remove a column permanently.
# NOTE(review): inplace=True mutates df and returns None; the
# non-mutating form `df = df.drop('E', axis=1)` is generally preferred.
df.drop('E', axis=1, inplace=True)
df
Out[11]:
In [12]:
# Select column 'A'; a single label returns a pandas Series
df['A']
Out[12]:
In [13]:
# Each column of a DataFrame is stored as a Series object, so
# single-column selection returns pandas.core.series.Series
type(df['A'])
Out[13]:
In [14]:
# Selecting several columns at once yields a new DataFrame
df.loc[:, ['A', 'D']]
Out[14]:
In [15]:
# Select a row by its label; a single row is returned as a Series
# whose index is the column names
df.loc['x']
Out[15]:
In [16]:
# Select a row by its numerical (0-based) index position
df.iloc[0]
Out[16]:
In [17]:
# Select the value of a single cell: row label first, column label second
df.loc['w', 'A']
Out[17]:
In [18]:
# Select a subset of the DataFrame: lists of row and column labels
# produce a new DataFrame
df.loc[['x', 'y'], ['B', 'C']]
Out[18]:
In [19]:
# Conditional selection: cells that fail the condition are
# replaced with NaN in the returned frame
df[df > 10]
Out[19]:
In [20]:
# Note that the conditional selection above only
# returns cells whose boolean value is True
# in the following boolean DataFrame
df > 10
Out[20]:
In [21]:
# Select the rows where column A is larger or equal to 9.
# A boolean Series used as an indexer keeps only the True rows.
df[df['A'] >= 9]
Out[21]:
In [22]:
# Note that we use `&` as conjunction since Python's `and` operator
# can only deal with single Boolean values e.g. `True and True`.
# The parentheses are required because `&` binds tighter than `>=`.
df[(df['A'] >= 9) & (df['C'] == 11)]
Out[22]:
In [23]:
# `|` is the element-wise disjunction (OR), the counterpart of `&` above
df[(df['A'] >= 9) | (df['C'] == 3)]
Out[23]:
In [24]:
# Reset the index to a numerical value.
# Note that the old index will become
# a column in our DataFrame.
# Returns a new DataFrame; df itself is unchanged.
df.reset_index()
Out[24]:
In [25]:
# Set a new index (returns a new DataFrame; df keeps its old index).
df['Country'] = 'CA DE DK NO'.split()
df.set_index('Country')
# To overwrite the old index permanently, use the following line instead:
# df.set_index('Country', inplace=True)
Out[25]:
In [26]:
# Build a two-level (hierarchical) index: outer letters, inner numbers
outside = ['p'] * 3 + ['q'] * 3
inside = [1, 2, 3] * 2
hierarchical_index = list(zip(outside, inside))
multi_index = pd.MultiIndex.from_tuples(hierarchical_index, names=['outside', 'inside'])
multi_index
Out[26]:
In [27]:
# A 6x2 frame of standard-normal draws indexed by the MultiIndex.
# NOTE(review): np.random.randn is unseeded, so values differ on every
# run; call np.random.seed(...) first for reproducible output.
df = pd.DataFrame(data=np.random.randn(6,2), index=multi_index, columns=['Column 1', 'Column 2'])
df
Out[27]:
In [28]:
# Select using the outer index; the result keeps only the inner level
df.loc['p']
Out[28]:
In [29]:
# Select using the inside index.
# A tuple key addresses both MultiIndex levels in one .loc lookup,
# replacing the chained .loc['p'].loc[2] form (two separate lookups
# that may operate on an intermediate copy).
df.loc[('p', 2)]
Out[29]:
In [30]:
# Select a specific cell.
# Row (a tuple covering both index levels) and column in a single
# .loc call replaces the chained .loc['p'].loc[2]['Column 1'] lookup,
# which is the chained-indexing pattern pandas warns about.
df.loc[('p', 2), 'Column 1']
Out[30]:
In [31]:
# Rename the levels of the MultiIndex (assignment mutates df in place)
df.index.names = ['O', 'I']
df
Out[31]:
A cross section (`xs`) is used when we need to select data at a particular level of a MultiIndex.
In [32]:
# Select rows whose inside index is equal to 1
# (xs drops the selected level from the result)
df.xs(1, level='I')
Out[32]:
In [33]:
# A small frame with an increasing number of missing values per column:
# A has one NaN, B has two, C has none.
d = {
    'A': [1, 2, np.nan],
    'B': [1, np.nan, np.nan],
    'C': [1, 2, 3],
}
df = pd.DataFrame(data=d)
df
Out[33]:
In [34]:
# Drop any rows with missing values (defaults: how='any', axis=0);
# returns a new DataFrame, df is unchanged
df.dropna()
Out[34]:
In [35]:
# Keep only the rows with at least 2 non-na values
# (the last row has only one non-NA value, so it is dropped):
df.dropna(thresh=2)
Out[35]:
The subset parameter specifies which columns an operation should consider instead of all columns. For instance, when dropping rows with missing values, subset lists the columns to inspect.
For instance, `df.dropna(thresh=1, subset=['A','B'])` will drop all rows that have fewer than 1 non-NA value in columns A and B (i.e. rows where both A and B are NA), rather than applying thresh=1 across all columns.
The line `df.dropna(how='all', subset=['A','B'])` will drop all rows whose values in columns A and B are all NA.
In [36]:
# Drop any columns with missing values (axis=1 applies the rule per column)
df.dropna(axis=1)
Out[36]:
In [37]:
# Replace missing values with a constant; returns a new DataFrame
df.fillna(0)
Out[37]:
In [38]:
# Replace missing values in column A with the mean of that column's
# non-missing values; returns a new Series (df itself is unchanged)
df['A'].fillna(value=df['A'].mean())
Out[38]:
In [39]:
# Load a subset of the SF Salaries dataset.
# usecols limits parsing to the listed columns; index_col makes 'Id'
# the row index. Assumes the CSV exists at data/sf-salaries-subset.csv
# relative to the notebook's working directory.
columns = 'Id EmployeeName JobTitle TotalPay Year'.split()
salaries_df = pd.read_csv('data/sf-salaries-subset.csv', index_col='Id', usecols=columns)
salaries_df.head()
Out[39]:
In [40]:
# Group rows by job title; this creates a lazy GroupBy object —
# nothing is computed until an aggregation is called on it
salaries_by_job_df = salaries_df.groupby('JobTitle')
In [41]:
# Get summary statistics (count, mean, std, quartiles, ...) on the
# TotalPay column, one row per job title
salaries_by_job_df['TotalPay'].describe()
Out[41]:
In [42]:
# Get summary statistics on all numeric columns, one row per group
salaries_by_job_df.describe()
Out[42]:
In [43]:
# Transpose the summary so the statistics become rows — easier to
# scan when there are many groups (.T is an alias for .transpose())
salaries_by_job_df.describe().T
Out[43]:
In [44]:
# Count the number of non-NA values in each column, per group
salaries_by_job_df.count()
Out[44]:
In [45]:
# Find the mean of numeric columns.
# numeric_only=True restricts the aggregation to numeric columns
# (e.g. excludes EmployeeName); since pandas 2.0, GroupBy.mean()
# raises a TypeError on non-numeric columns unless this flag is set.
salaries_by_job_df.mean(numeric_only=True)
Out[45]:
In [46]:
# Get the highest pay across the whole dataset (a scalar)
salaries_df['TotalPay'].max()
Out[46]:
In [47]:
# Get the position of the highest pay.
# NOTE(review): in current pandas Series.argmax() returns the
# *positional* index; very old versions returned the label (like
# idxmax()). idxmax() is the version-stable, label-based choice.
salaries_df['TotalPay'].argmax()
Out[47]:
In [48]:
# Get the person with the highest pay.
# idxmax() returns the row *label* of the maximum, which pairs with
# .loc. The original iloc/argmax pairing mixes positions with labels
# and can select the wrong row if the 'Id' index does not start at 0
# (or on old pandas versions where argmax() returned a label).
salaries_df.loc[salaries_df['TotalPay'].idxmax()]
Out[48]:
In [49]:
# Three 4x4 frames of labelled strings ('A0'..'D11') with
# consecutive, non-overlapping integer indices (0-3, 4-7, 8-11).
df1 = pd.DataFrame({col: [f'{col}{i}' for i in range(0, 4)] for col in 'ABCD'},
                   index=list(range(0, 4)))
df2 = pd.DataFrame({col: [f'{col}{i}' for i in range(4, 8)] for col in 'ABCD'},
                   index=list(range(4, 8)))
df3 = pd.DataFrame({col: [f'{col}{i}' for i in range(8, 12)] for col in 'ABCD'},
                   index=list(range(8, 12)))
In [50]:
# Combine along the rows (the inputs' indices 0-11 are preserved)
pd.concat([df1, df2, df3])
Out[50]:
In [51]:
# Combine along the columns.
# Note that pandas assigns NaN to cells whose indices do not align
# (each input frame covers a different set of row labels).
pd.concat([df1, df2, df3], axis=1)
Out[51]:
The merge function is useful when we want to combine DataFrames the way we join tables in SQL.
In [52]:
# Two frames sharing the composite key (key1, key2)
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0'] * 4,
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})
In [53]:
# SQL-style inner join on the composite key (key1, key2):
# only key combinations present in both frames are kept
pd.merge(left, right, how='inner', on=['key1', 'key2'])
Out[53]:
The join function is used to combine the columns of DataFrames that may have different indices. It works exactly like the merge function, except that the keys we join on are the indices instead of columns.
In [54]:
# Two frames with partially overlapping index labels (K0 and K2 match)
left = pd.DataFrame(
    {'A': [f'A{i}' for i in range(3)],
     'B': [f'B{i}' for i in range(3)]},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'],
     'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
In [55]:
# Default is a left join: keep every index label of `left`,
# filling unmatched right-hand cells with NaN
left.join(right)
Out[55]:
In [56]:
# An outer join keeps the union of both indices; missing cells become NaN
left.join(right, how='outer')
Out[56]:
In [57]:
# A small frame for demonstrating common operations
# (note the duplicated 444 in col2)
df = pd.DataFrame({
    'col1': [1, 2, 3, 4],
    'col2': [444, 555, 666, 444],
    'col3': ['abc', 'def', 'ghi', 'xyz'],
})
df.head()
Out[57]:
In [58]:
# Find the unique values in col2 (returned as a numpy array)
df['col2'].unique()
Out[58]:
In [59]:
# Find the number of unique values in col2
df['col2'].nunique()
Out[59]:
In [60]:
# Count how many times each unique value occurs in col2
# (sorted by count, descending)
df['col2'].value_counts()
Out[60]:
In [61]:
# value_counts() combined with head(X) yields the X most common values
df['col2'].value_counts().head(1)
Out[61]:
In [62]:
# Apply a custom function to each element of a column
def square(value):
    """Return value raised to the second power."""
    return value ** 2

df['col1'].apply(square)
Out[62]:
In [63]:
# Find the names of all the columns in the DataFrame (an Index object)
df.columns
Out[63]:
In [64]:
# Sort rows by the values in col2 (ascending by default);
# returns a new DataFrame and keeps the original row labels
df.sort_values(by='col2')
Out[64]:
In [65]:
# Show a boolean mask marking the missing (NaN) cells
# (.isna() is the canonical alias of .isnull())
df.isna()
Out[65]:
In [66]:
# Scrape all HTML tables on the page into a list of DataFrames.
# thousands='.' and decimal=',' handle the Danish number format.
# NOTE(review): requires network access; the URL and the page's table
# layout may have changed since this was written — verify before use.
data = pd.read_html('https://borsen.dk/kurser/danske_aktier/c20_cap.html', thousands='.', decimal=',')
df = data[0]
In [67]:
# Show information about the data (column dtypes, non-null counts)
df.info()
In [68]:
# Inspect the auto-detected column names before renaming them
df.columns
Out[68]:
In [69]:
# Assign Danish column names.
# Fixes the 'Akie' misspelling ('Aktie' = share/stock); no later cell
# references this column by name, so the rename is safe here.
df.columns = ['Aktie', '%', '+/-', 'Kurs', 'ATD%', 'Bud', 'Udbud', 'Omsætning']
In [70]:
# Verify the renamed columns and their dtypes
df.info()
In [71]:
# First value of the turnover ('Omsætning') column.
# .iloc[0] selects by position explicitly, instead of the original
# df['Omsætning'][0], which indexes a Series with a bare integer —
# ambiguous between label and position in pandas.
df['Omsætning'].iloc[0]
Out[71]:
In [ ]: