In [ ]:
#### Importing Data ####
# FIX: the reference lines below were bare pseudo-code (syntax errors), so this
# cell could never execute; they are kept as comments so the cell is valid Python.
import pandas as pd
# column_names_if_not_present_with_data = ['name1', 'name2', ...]
# dataset = pd.read_csv("link / file with extension in same working directory",
#                       names=column_names_if_not_present_with_data, sep=",")
# sep = the separator between values of 2 different columns:
#       "," (default), "\t" (tab), ";", etc.   (FIX: "/t" is not a separator)
# dataset = pd.read_csv("link / file with extension in same working directory", header=None)
# With header=None pandas names the columns 0, 1, 2, ... itself
# Other readers:
# pd.read_fwf("file with extension")        - fixed-width files
# pd.read_excel("excel file with .xls")     - Excel files
# pd.read_clipboard()                       - data currently on the clipboard
In [ ]:
## Naive Exploration
data.shape - dimensions (m rows, n cols)
data.info() - basic info about the data
data.describe() - statistical summary of the data
data.head() - first 5 entries
data.tail() - last 5 entries
data.sample(6) - random 6 entries (the int value can be changed)
## Data Accessing
'col_name_if_its_a_string' and col_name_if_its_int_or_float
data['col_name_if_its_a_string'] ,data[col_name_if_its_int_or_float] - accessing specific cols
data[['col_name1','col_name2','col_name3',..]] -customised col selection
data.iloc[:,col_index_strt:col_index_end+1:step_size] - sequential col selection
data[row_index_strt:row_index_end+1:step_size] - sequential row selection
data = pd.DataFrame(container) - to convert into pandas dataframe
type(container_or_its_variable) - to find the datatype of any container
container - int, float, string, bool, pandas dataframe, pandas series, numpy arrays
In [1]:
import pandas as pd
from io import StringIO

# Tiny inline CSV fixture: two cells are missing (row 2 col C, row 3 col D),
# so the frame below contains exactly two NaN values.
raw_csv = '''A,B,C,D
1.,2.,3.,4.
5.,6.,,8.
0.,11.,12,'''
df = pd.read_csv(StringIO(raw_csv))
In [5]:
#### Missing Values ####
# Count the NaN (missing) values in each column
df.isna().sum()  # isna() is an exact alias of isnull()
Out[5]:
In [87]:
### Eliminating whole rows / columns that contain missing values
## To drop rows containing any NaN (most commonly used)
df = df.dropna() #dropna() returns a NEW frame, so it must be assigned back to persist
## To drop columns containing any NaN
df = df.dropna(axis=1)
##Other parameters of dropna()
#thresh = int value - keep only rows/columns that have at least that many NON-NaN values
#NOTE(review): running both lines above drops rows first, then columns from the already-filtered frame
Out[87]:
In [ ]:
### Imputing Missing Values
# FIX: sklearn.preprocessing.Imputer was removed (sklearn >= 0.22); the
# replacement is sklearn.impute.SimpleImputer, which always works column-wise
# and takes np.nan (not the string 'NaN') as the missing marker.
import numpy as np
from sklearn.impute import SimpleImputer
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# Other strategy = 'median' / 'most_frequent' / 'constant'
imputed_data = imr.fit_transform(df)  # returns a numpy array (column names lost)
# Rebuild the DataFrame and restore the original column names
df = pd.DataFrame(imputed_data, columns=df.columns)
In [80]:
import pandas as pd

# Toy dataset: one nominal column (color), one ordinal (size), one numeric
# (price) and a class label — used below to demo categorical encoding.
df = pd.DataFrame(
    [['green', 'M', 10.1, 'class1'],
     ['red', 'L', 13.5, 'class2'],
     ['blue', 'XL', 15.3, 'class1']],
    columns=['color', 'size', 'price', 'classlabel'],
)
df
Out[80]:
In [83]:
#### Categorical Data ####
### Distinct values of a specific column
df['classlabel'].unique()
### Encoding the output column
from sklearn import preprocessing as pre
label_encode = pre.LabelEncoder()
# fit + transform in a single step (equivalent to separate fit()/transform() calls)
encoded_label = label_encode.fit_transform(df['classlabel'])
df = df.iloc[:, :3]                  # keep only the three feature columns
df['encoded_class'] = encoded_label  # append the integer-encoded label
df
## Decoding the output column back to the original string labels
decoded_labels = label_encode.inverse_transform(encoded_label)
Out[83]:
In [84]:
### Creating Dummy Features for Input Columns
# One-hot encode the remaining categorical columns; returns a NEW frame (df itself unchanged)
df.pipe(pd.get_dummies)
Out[84]:
In [43]:
#### Partitioning the Dataset into Train & Test ####
# In ML competitions X and y are often supplied separately
import pandas as pd
from sklearn.model_selection import train_test_split as tts

# UCI Wine dataset: first column is the class label, the rest are features
col_names = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
             'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
             'Proanthocyanins', 'Color intensity', 'Hue',
             'OD280/OD315 of diluted wines', 'Proline']
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
                   names=col_names)
y = data['Class label']
X = data.iloc[:, 1:]
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.25, random_state=42)
In [41]:
#### Feature Scaling ####
### Standardization - most commonly used
# Centers each feature around zero with standard deviation 1
from sklearn.preprocessing import StandardScaler as ss
std_scaler = ss()
# Fit on the training split only, then apply the SAME transform to the test split
X_train = pd.DataFrame(std_scaler.fit_transform(X_train), columns=data.columns[1:])
X_test = pd.DataFrame(std_scaler.transform(X_test), columns=X_train.columns)
### Normalization - can be tried as an alternative
# Rescales each feature to the 0-1 range
# NOTE(review): running this cell top to bottom applies BOTH scalers in sequence
from sklearn.preprocessing import MinMaxScaler as mms
norm_scaler = mms()
X_train = pd.DataFrame(norm_scaler.fit_transform(X_train), columns=data.columns[1:])
X_test = pd.DataFrame(norm_scaler.transform(X_test), columns=X_train.columns)
Out[41]:
In [45]:
#### Avoiding Overfitting ####
# Overfitting: performance on train data >>> test data (no generalization)
### Feature Importance via Random Forests
import numpy as np
from sklearn.ensemble import RandomForestClassifier as rfc
feat_labels = data.columns[1:]
forest = rfc(n_estimators=10000, random_state=0, n_jobs=-1)  # 10000 trees: slow but stable ranking
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]  # feature indices, most important first
for f in range(X_train.shape[1]):
    # BUG FIX: the label must be reordered with the same `indices` as the
    # importance value — the original printed feat_labels[f] next to
    # importances[indices[f]], mislabelling every row.
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
import matplotlib.pyplot as plt
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices], color='lightblue', align='center')
# BUG FIX: tick labels reordered by `indices` so they match the sorted bars
plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()
# We can now keep only the few most important columns
X_train_new = X_train[['Alcohol', 'Malic acid', 'Ash']]
X_train_new.head()
Out[45]:
In [60]:
#### Dimensionality Reduction ####
### PCA - Unsupervised
from sklearn.decomposition import PCA
pca = PCA(n_components=5)  # n_components decides the resulting number of columns
X_train_pca = pca.fit_transform(X_train)
X_train = pd.DataFrame(X_train_pca)
# X_train reduced to 5 columns
### LDA - Supervised
# FIX: sklearn.lda was removed from scikit-learn; LDA now lives in
# sklearn.discriminant_analysis as LinearDiscriminantAnalysis.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA()
X_train_lda = lda.fit_transform(X_train, y_train)
X_train = pd.DataFrame(X_train_lda)
# X_train reduced to at most (n_classes - 1) columns (2 for the 3-class wine data)
### Kernel PCA - nonlinear variant
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=4, kernel='rbf', gamma=15)  # n_components decides the resulting dimension
X_train_kpca = kpca.fit_transform(X_train)
X_train = pd.DataFrame(X_train_kpca)
In [183]:
######################## EXAMPLE ##########################
## Importing Data
# Breast Cancer Wisconsin (Diagnostic) dataset from the UCI ML repository.
# Column 1 holds the class label (it is label-encoded in the next cell) and
# columns 2 onward are the numeric features (see the iloc slicing below).
# header=None -> pandas names the columns with integers 0, 1, 2, ...
import pandas as pd
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',header = None)
In [184]:
## Encoding the output column
from sklearn import preprocessing as pre
label_encode = pre.LabelEncoder()
# fit + transform in one step (equivalent to the separate fit()/transform() calls)
encoded_label = label_encode.fit_transform(df[1])
In [196]:
# BUG FIX: the cell originally began with `data = data.drop(1)`, which raises
# NameError on a fresh kernel (`data` is only defined below) and, even with
# `data` defined, would drop the ROW labelled 1, not a column — removed.
# Drop a column:    data = data.drop('col_name', axis=1)  |  data = data.drop(col_name_if_int, axis=1)
# Drop multiple:    data = data.drop(['col_name1', 'col_name2'], axis=1)
data1 = df.iloc[:, 2:]    # the numeric feature columns
data2 = df.iloc[:, 0]     # selecting a particular column (the ID column)
data = data1.join(data2)  # joining 2 dataframes on the index
data['class'] = encoded_label
data.head()
Out[196]:
In [195]:
## Partitioning Data
from sklearn.model_selection import train_test_split as tts
# BUG FIX: the original `X = data.iloc[:, :32]` selected all 32 columns of
# `data`, INCLUDING the 'class' target itself — the label leaked into the
# features. Drop the target explicitly instead.
X = data.drop('class', axis=1)
y = data['class']
# NOTE(review): the ID column (0) is still in X; consider dropping it too.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.25, random_state=42)
X_train.head()
Out[195]:
In [ ]:
__TADA__