In [6]:
from pandas import read_csv
import os
import pandas as pd
In [33]:
# Hard-coded absolute directory from the original notebook that holds sample.csv.
DATA_DIR = 'C:/Users/arielxxd/Documents/GitHub/py_for_engineers'
# Only change directory when that path exists on this machine; otherwise
# sample.csv is looked up in the current working directory instead of the
# cell failing inside os.chdir.
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)
sample_emission_df = read_csv('sample.csv', sep=',')  # load data as data frame
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(sample_emission_df.head())  # print top 5 rows
In [34]:
# Query data from the dataframe. For the difference between iloc and loc see:
# https://pandas.pydata.org/pandas-docs/stable/indexing.html
# .loc is label-based, so this prints the row whose index label is 1.
print(sample_emission_df.loc[1])
In [35]:
# Select the rows of the dataframe where the OpModeID column equals 0.
print(sample_emission_df.loc[sample_emission_df['OpModeID'] == 0])
In [36]:
# Select one column (emissionQuant) for the rows where another column
# (OpModeID) equals 0.
print(sample_emission_df.loc[sample_emission_df['OpModeID'] == 0, 'emissionQuant'])
In [37]:
# Select the rows whose OpModeID value falls into a given set.
target_opmode = [0, 1, 11]
print(sample_emission_df.loc[sample_emission_df['OpModeID'].isin(target_opmode)])
In [38]:
# Select the rows whose OpModeID value does NOT fall into a given set;
# `~` negates the boolean mask returned by isin().
target_opmode = [0, 1, 11]
print(sample_emission_df.loc[~sample_emission_df['OpModeID'].isin(target_opmode)].head())
In [39]:
# Most of the time you can manipulate the dataframe without a loop.
# Here a new 'rate' column is computed for every row at once by dividing
# the whole emissionQuant column by 3600.0.
sample_emission_df['rate'] = sample_emission_df['emissionQuant'] / 3600.0
print(sample_emission_df.head())
In [40]:
# if you want to apply a function to each row, try this:
def try_this_function(x):
    """Return ``x`` divided by 3600.

    The divisor is written as a float so that integer input is not
    floor-divided on Python 2, matching the ``/ 3600.0`` used by the other
    rate calculations in this notebook.
    """
    y = x / 3600.0
    return y
# axis=1 makes apply() hand each row to the lambda, which forwards that
# row's emissionQuant value to try_this_function.
sample_emission_df['rate_2'] = sample_emission_df.apply(
    lambda row: try_this_function(row['emissionQuant']), axis=1
)
print(sample_emission_df.head())
In [41]:
# Under rare conditions you really need a loop; then you can try this.
sample_emission_df['new_rate'] = None
# Series.items() yields (index label, value) pairs. The .loc setter is
# label-based, so it must be fed labels; the positions produced by
# enumerate() only match the labels when the index is the default 0..n-1.
for label, quant in sample_emission_df['emissionQuant'].items():
    sample_emission_df.loc[label, 'new_rate'] = quant / 3600.0
print(sample_emission_df.head())
In [42]:
# If you only want to change selected values, use a boolean mask with .loc.
# NOTE: the update is done in place, so re-running this cell multiplies the
# selected values by the factor again.
whatever_factor = 1.5
# Rows with OpModeID == 1; OpModeID is not modified below, so the same mask
# selects the same rows before and after the update.
is_opmode_1 = sample_emission_df['OpModeID'] == 1
print("Before: ")
print(sample_emission_df.loc[is_opmode_1, 'emissionQuant'])
sample_emission_df.loc[is_opmode_1, 'emissionQuant'] *= whatever_factor
print("After: ")
print(sample_emission_df.loc[is_opmode_1, 'emissionQuant'])
In [57]:
# Processing categorical data? Try groupby!
# Groups rows by (modelYearID, fuelTypeID) and reports the mean and the
# count of emissionQuant within each group.
group_result = (
    sample_emission_df
    .groupby(['modelYearID', 'fuelTypeID'])['emissionQuant']
    .agg(['mean', 'count'])
)
print(group_result)
In [58]:
# Want some descriptive statistics? Let's try describe().
print(sample_emission_df.describe())
In [60]:
# pandas pivot table: one row per OpModeID, one column per fuelTypeID,
# each cell holding the mean emissionQuant (mean is pivot_table's default
# aggregation; it is spelled out here for readability).
out_emission_df = pd.pivot_table(
    sample_emission_df,
    index='OpModeID',
    columns='fuelTypeID',
    values='emissionQuant',
    aggfunc='mean',
)
print(out_emission_df)
In [66]:
# Some data visualization: horizontal bar chart of the pivot table,
# drawn with the 'fivethirtyeight' matplotlib style sheet.
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
out_emission_df.plot.barh()
plt.show()
In [68]:
# Don't worry about the layout! Try this: import seaborn, re-apply the
# style sheet and draw the same horizontal bar chart again.
# NOTE(review): `sns` is not referenced afterwards; the import appears to be
# here only for its effect on plot appearance - confirm before removing.
import seaborn as sns
plt.style.use('fivethirtyeight')
out_emission_df.plot.barh()
plt.show()
In [72]:
# Want some decoration? Here you are: axis labels and a legend.
out_emission_df.plot.barh()
# Horizontal bars: values run along x, categories along y.
plt.xlabel('Emission rate (gram/hour)', fontsize=14)
plt.ylabel('Operating mode bin', fontsize=14)
# NOTE(review): the legend labels assume the fuelTypeID columns are ordered
# Diesel, CNG - verify against the data.
plt.legend(['Diesel', 'CNG'])
plt.show()
In [89]:
# Change the plot face color: set the axes background to white for this
# and all subsequently created plots (rcParams is a global setting).
plt.rcParams['axes.facecolor'] = 'white'
out_emission_df.plot.barh()
# Horizontal bars: values run along x, categories along y.
plt.xlabel('Emission rate (gram/hour)', fontsize=14)
plt.ylabel('Operating mode bin', fontsize=14)
# NOTE(review): the legend labels assume the fuelTypeID columns are ordered
# Diesel, CNG - verify against the data.
plt.legend(['Diesel', 'CNG'], fontsize=14)
plt.show()
In [94]:
# Split plots? Sure! subplots=True draws one panel per column, arranged
# here in a 1 x 2 grid; the result is indexed as ax[row][col] below.
ax = out_emission_df.plot(kind='barh', subplots=True, layout=(1, 2), figsize=(8, 8), sharex=False)
# change label for one plot
ax[0][0].set_xlabel('emission rate')
# Pass the labels as a list: a bare string is treated as a sequence of
# one-character labels, which only happened to work for 'd'.
ax[0][0].legend(['d'])
plt.tight_layout()
plt.show()
In [95]:
# Plot error bars: assume error = 0.05 * value for every cell of the
# pivot table.
error = out_emission_df * 0.05
print(error)
In [110]:
# Vertical bar chart of the pivot table with the error frame as y error bars.
out_emission_df.plot.bar(yerr=error, error_kw=dict(lw=1, capsize=2, capthick=1))
# NOTE(review): the legend labels assume the fuelTypeID columns are ordered
# Diesel, CNG - verify against the data.
plt.legend(['Diesel', 'CNG'], fontsize=14)
# plot.bar draws vertical bars, so the categories (pivot index, OpModeID)
# are on the x axis and the emission values on the y axis - the opposite of
# the horizontal barh charts drawn earlier in this notebook.
plt.xlabel('Operating mode bin', fontsize=14)
plt.ylabel('Emission rate (gram/hour)', fontsize=14)
plt.xticks(rotation=0)
plt.show()
In [ ]: