In [1]:
#importing pandas package
import pandas as pd
path_to_projectData = 'Data/' #input path where the data of the project resides
bot_data = pd.read_csv(path_to_projectData + 'bots_data.csv',encoding='latin-1') #reading the bots_data.csv file
bot_data.head(10)
nonbots_data = pd.read_csv(path_to_projectData + 'nonbots_data.csv',encoding="latin-1") #reading the nonbots_data.csv file
nonbots_data.head(10)

complete_data= bot_data.append(nonbots_data) # Combining both the bot and non-bots data set
complete_data.head(10)

#SELECTING TOP 10 ROWS FROM THE COMBINED DATASET


Out[1]:
id id_str screen_name location description url followers_count friends_count listedcount created_at favourites_count verified statuses_count lang status default_profile default_profile_image has_extended_profile name bot
0 3.982732e+09 3982731976 mcgucket_bot NaN A bot that tweets every line said by Fiddlefor... NaN 1129 7 2 Sat Oct 17 22:35:31 +0000 2015 0 False 23557 en {'retweeted': False, 'is_quote_status': False,... False False False McGucket bot 1
1 8.410000e+17 8.41E+17 BowieK66 NaN NaN NaN 0 22 0 13/03/2017 22:21 0 False 1 en Status(_api=<tweepy.api.API object at 0x101927... True True False MJ Kuhn 1
2 2.768375e+09 2768374981 ducknoteprice NaN NaN NaN 3 0 3 Mon Aug 25 23:17:59 +0000 2014 0 False 1050 en {'created_at': 'Fri Feb 26 17:01:15 +0000 2016... True False False duckNote 1
3 3.304189e+09 3304189373 robotrecipes robot kitchen tasty recipes for robot // not for human // a ... http://t.co/PdagJGqVMR 505 13 49 Sat May 30 23:42:16 +0000 2015 0 False 5109 en {u'contributors': None, u'truncated': False, u... False False False robot recipes 1
4 3.224289e+09 3224289024 everyumlaut NaN bot by @dbaker_h NaN 15 0 11 Sat May 23 14:39:21 +0000 2015 0 False 31365 en-gb {u'contributors': None, u'truncated': False, u... True False False ÌÇvÌÇrÌÀ Ì_mlÌ_Ì_‡¼Ñ 1
5 7.300000e+17 7.30E+17 glossatory Australah SOCIAL ON GLOSSATORY: the branch of the consid... https://t.co/hcUNcUdlcq 16 1 4 5/11/2016 5:45 0 False 1832 en Status(in_reply_to_user_id=None, favorited=Fal... False False False GLOSSATORY 1
6 8.200000e+17 8.20E+17 Fancypants6047 vancouver NaN NaN 41 394 0 13/01/2017 19:37 3 False 4 en Status(_api=<tweepy.api.API object at 0x101927... True False True Fancypants 1
7 2.602217e+09 2602217174 Hedgehogize Moebius @Hedgehogize me or @Hedgehogize NAME to unlock... http://t.co/a1UvEiAl3u 549 370 12 Thu Jul 03 20:37:48 +0000 2014 12 False 4658 en {u'contributors': None, u'truncated': False, u... False False False YOU the Hedgehog 1
8 8.330000e+17 8.33E+17 jamieph93986621 NaN I wasn't bor ysterday, was born tomorrow NaN 0 60 0 Sun Feb 19 03:47:42 +0000 2017 0 False 10 en {"created_at": "Mon Mar 13 02:55:10 +0000 2017... True False False Jamie Phillips 1
9 8.250000e+17 8.25E+17 NothemDonella NaN Your diac No NaN 0 43 4 Fri Jan 27 15:51:34 +0000 2017 48 False 86 en {"created_at": "Mon Mar 13 02:52:13 +0000 2017... True False False Donella Nothem 1

In [2]:
#import train_test_split,Imputer from sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

# selecting attributes which have numeric values and categorical values
X= complete_data.loc[:,['followers_count','friends_count','listedcount','favourites_count','verified','statuses_count',
              'default_profile','default_profile_image','has_extended_profile']]

X = Imputer().fit_transform(X) # filling the NaN and infinite values

Y= complete_data['bot']

#Splitting the dataset into test and train data
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42) 


#Importing the DecisionTreeClassifier and predicting whether the profile is a bot or not using it
from sklearn.tree import DecisionTreeClassifier
dtree= DecisionTreeClassifier()
dtree.fit(X_train,Y_train)
dtree_predictions=dtree.predict(X_test)

#Importing the classification_report,confusion_matrix from sklearn.metrics package
from sklearn.metrics import classification_report,confusion_matrix

#Printing the Confusion Matrix and Classification Report for DecisionTreeClassifier
print(confusion_matrix(Y_test,dtree_predictions))
print('\n')
print(classification_report(Y_test,dtree_predictions))


[[197  25]
 [ 46 179]]


             precision    recall  f1-score   support

          0       0.81      0.89      0.85       222
          1       0.88      0.80      0.83       225

avg / total       0.84      0.84      0.84       447


In [3]:
#Importing the RandomForestClassifier and predicting whether the profile is a bot or not using it
from sklearn.ensemble import RandomForestClassifier
rfc= RandomForestClassifier(n_estimators=200)
rfc.fit(X_train,Y_train)
rfc_predictions=rfc.predict(X_test)

#Printing the Confusion Matrix and Classification Report for RandomForestClassifier
print(confusion_matrix(Y_test,rfc_predictions))
print('\n')
print(classification_report(Y_test,rfc_predictions))


[[209  13]
 [ 42 183]]


             precision    recall  f1-score   support

          0       0.83      0.94      0.88       222
          1       0.93      0.81      0.87       225

avg / total       0.88      0.88      0.88       447