In [1]:
# Load the bot and non-bot account CSVs and combine them into one DataFrame.
import pandas as pd

path_to_projectData = 'Data/'  # directory where the project's data files reside

# Both files are read with latin-1 encoding.
bot_data = pd.read_csv(path_to_projectData + 'bots_data.csv', encoding='latin-1')
bot_data.head(10)  # NOTE: by default only a cell's last expression is rendered, so this preview is not shown
nonbots_data = pd.read_csv(path_to_projectData + 'nonbots_data.csv', encoding="latin-1")
nonbots_data.head(10)  # NOTE: not rendered either; wrap in display(...) if the preview is wanted

# Stack the bot rows on top of the non-bot rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Like append, it keeps each frame's original row labels.
complete_data = pd.concat([bot_data, nonbots_data])

# Show the first 10 rows of the combined dataset (this is the cell's output).
complete_data.head(10)
Out[1]:
In [2]:
# Build the feature matrix and target, split into train/test, impute missing
# values, then fit and evaluate a DecisionTreeClassifier.
from sklearn.model_selection import train_test_split
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22. SimpleImputer is its replacement (same default 'mean' strategy); it
# is imported under the old name so any code that refers to `Imputer` keeps working.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Numeric and boolean-like profile attributes used as model inputs.
feature_columns = ['followers_count', 'friends_count', 'listedcount', 'favourites_count',
                   'verified', 'statuses_count', 'default_profile', 'default_profile_image',
                   'has_extended_profile']
features = complete_data.loc[:, feature_columns]
Y = complete_data['bot']  # target label column

# Split BEFORE imputing so that the imputer's column means are learned from the
# training rows only (fitting on all rows would leak test-set statistics).
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    features, Y, test_size=0.2, random_state=42)

# Replace NaN entries with the per-column mean of the training split.
# Only NaN is imputed; infinite values are not handled by the imputer.
imputer = Imputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)
# Keep `X` bound to an imputed array of all rows, as the original cell did.
X = imputer.transform(features)

# Fit the decision tree; random_state is fixed so that re-running the cell
# reproduces the same tree and the same metrics.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, Y_train)
dtree_predictions = dtree.predict(X_test)

# Print the confusion matrix and classification report for the decision tree.
print(confusion_matrix(Y_test, dtree_predictions))
print('\n')
print(classification_report(Y_test, dtree_predictions))
In [3]:
# Fit a RandomForestClassifier on the training split and evaluate it on the
# held-out test split.
from sklearn.ensemble import RandomForestClassifier

# 200 trees; random_state is fixed because a random forest is stochastic
# (bootstrap samples and feature sub-sampling), so without a seed the printed
# metrics would change on every run.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train, Y_train)
rfc_predictions = rfc.predict(X_test)

# Print the confusion matrix and classification report for the random forest.
print(confusion_matrix(Y_test, rfc_predictions))
print('\n')
print(classification_report(Y_test, rfc_predictions))