In [1]:
import pandas as pd
In [2]:
## .loc slices by index label and includes both endpoints;
## .iloc slices by integer position and excludes the stop position
vals = [0, 1, 2]
idx = [10, 11, 12]
ser = pd.Series(data=vals, index=idx)

label_slice = ser.loc[10:11]
position_slice = ser.iloc[0:2]

print("...using loc")
print(label_slice)
print("\n...using iloc")
print(position_slice)
In [3]:
## build a Series with string labels, then look values up by label
vals = [0, 1, 2, 3]
idx = ['a', 'b', 'c', 'd']
ser = pd.Series(data=vals, index=idx)
print(ser)

first_value = ser.loc['a']
a_through_c = ser.loc['a':'c']  ## label slices include the end label

print("\n...single index")
print(first_value)
print("\n...a slice")
print(a_through_c)
In [4]:
## build a DataFrame from a list of lists: each inner list becomes one row
cols = ['a', 'b', 'c']
vals = [
    [1, 2, 3],
    [4, 5, 6],
]
df = pd.DataFrame(vals, columns=cols)
df
Out[4]:
In [5]:
## build a DataFrame from a list of dictionaries: one dict per row, keys become the column names
rows = [
    dict(a=1, b=2, c=3),
    dict(a=4, b=5, c=6),
]
df = pd.DataFrame(data=rows)
df
Out[5]:
In [6]:
## how pandas handles missing numeric values:
## the first row is one item short, so its last column is filled with NaN
cols = ['a', 'c', 'd']
vals = [
    [10, 11],
    [20, 21, 22],
]
df = pd.DataFrame(vals, columns=cols)
df
Out[6]:
In [7]:
## how pandas handles missing string values:
## the first row is one item short, so its last column is left as a missing value
cols = ['a', 'c', 'd']
vals = [
    ['z', 'y'],
    ['x', 'w', 'v'],
]
df = pd.DataFrame(vals, columns=cols)
df
Out[7]:
In [8]:
## swap rows and columns; the result is only displayed here, it is not stored anywhere
df.T
Out[8]:
In [9]:
## display df again: the transposed result in the previous cell was not assigned back to df
df
Out[9]:
In [10]:
## dealing with missing data: drop every row that contains a missing value
vals = [[10, 11], [20, 21, 22]]
cols = ['a', 'c', 'd']
df1 = pd.DataFrame(vals, columns=cols).dropna()
df1
Out[10]:
In [11]:
## alternatively, replace the missing values with a sentinel (-1 here);
## vals and cols are the ones defined in the previous cell
df2 = pd.DataFrame(vals, columns=cols).fillna(-1)
df2
Out[11]:
In [ ]:
In [12]:
## load winequality-red.csv (fields are separated by semicolons) and preview the first rows
df = pd.read_csv('winequality-red.csv', sep=';')
df.head()
Out[12]:
In [13]:
import pprint

## report the frame's dimensions (rows x columns), then show per-column summary statistics
n_rows, n_cols = df.shape
print("Shape of my data frame: {} x {}".format(n_rows, n_cols))
df.describe()
Out[13]:
In [12]:
import re
## clean up the column names: replace each run of whitespace with a single underscore
## (e.g. "a b" -> "a_b") so the names can be used directly in query()/eval() expressions
cols = df.columns.tolist()
## raw string: "\s" in a plain string literal is an invalid escape sequence and
## triggers a warning on current Python versions; the regex itself is unchanged
df.columns = [re.sub(r"\s+", "_", col) for col in cols]
In [3]:
## show the frame's dimensions followed by the list of its column names
n_rows, n_cols = df.shape
print("Shape of my data frame: {} x {}".format(n_rows, n_cols))
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(df.columns.tolist())
In [14]:
## basic statistical calculations: per-column summary statistics
summary_stats = df.describe()
summary_stats
Out[14]:
In [15]:
## selecting with a single column label gives back a one-dimensional Series
a_column = df['fixed_acidity']
## selecting with a list of column labels gives back another DataFrame
wanted = ['fixed_acidity', 'citric_acid']
some_columns = df[wanted]

print(type(a_column))
print(type(some_columns))
In [16]:
## get rows from a given column using loc
## (label-based: the rows labelled 100 through 102, end label included)
row_labels = slice(100, 102)
df.loc[row_labels, 'fixed_acidity']
Out[16]:
In [17]:
## get rows from a given column using iloc
## (position-based: positions 100 and 101, the stop position is excluded)
fixed_acidity = df['fixed_acidity']
fixed_acidity.iloc[100:102]
Out[17]:
In [18]:
## get rows from multiple columns using iloc
wanted = ['fixed_acidity', 'chlorides']
df[wanted].iloc[100:102]
Out[18]:
In [19]:
## group the rows by their quality value and average every other column within each group
df_quality = df.groupby(by="quality")
df_quality.mean()
Out[19]:
In [20]:
## keep only the rows whose alcohol value is above 12.0, then average per quality group
mask = df['alcohol'].gt(12.0)
df1 = df.loc[mask]
df1.groupby(by="quality").mean()
Out[20]:
In [21]:
## multiple conditionals: combine boolean masks with & (each comparison is its own mask)
high_alcohol = df['alcohol'] > 12.0
quality_above_5 = df['quality'] > 5
df[high_alcohol & quality_above_5].groupby("quality").mean()
Out[21]:
In [22]:
## query() evaluates the same kind of conditionals from a string expression
result_df = df.query('alcohol >= 9.10 and pH < 3.5')
## show only the two filtered-on columns for the rows labelled 1 through 6
shown_cols = ['alcohol', 'pH']
result_df.loc[1:6, shown_cols]
Out[22]:
In [23]:
## single column sort (ascending is the default order)
df.sort_values('fixed_acidity')
Out[23]:
In [25]:
## multicolumn sort: order by the first key, with ties broken by the second (both ascending)
sort_keys = ['fixed_acidity', 'volatile_acidity']
df.sort_values(by=sort_keys)
Out[25]:
In [ ]:
In [26]:
## remove the sulphates column from df in place, then summarise the remaining columns;
## errors='ignore' keeps this cell re-runnable: without it a second execution raises
## KeyError because the column has already been dropped
df.drop(columns='sulphates', inplace=True, errors='ignore')
df.info()
In [27]:
## Using eval: the assignment inside the expression adds total_acidity to df as a new column
acidity_expr = 'total_acidity = volatile_acidity + fixed_acidity'
df.eval(acidity_expr, inplace=True)

## preview the new column next to the two columns it was computed from
acidity_cols = ['total_acidity', 'volatile_acidity', 'fixed_acidity']
df[acidity_cols].head()
Out[27]:
In [ ]:
In [ ]: