In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# ---- Summary of the Twitter accounts ---- #
# RER_A
# RERB
# RERC_SNCF --> traffic info tweets tagged "Infotrafic"
# RERD_SNCF --> traffic info tweets tagged "Infotrafic"
# RERE_SNCF --> traffic info tweets tagged "Infotrafic"
# Ligne1_RATP to Ligne14_RATP --> metro lines 1 to 14
line_list = ['RER_A', 'RER_B', 'RER_C', 'RER_D', 'RER_E','Ligne1_RATP', 'Ligne2_RATP', 'Ligne3_RATP', 'Ligne4_RATP',
'Ligne5_RATP', 'Ligne6_RATP', 'Ligne7_RATP', 'Ligne8_RATP', 'Ligne9_RATP', 'Ligne10_RATP', 'Ligne11_RATP', 'Ligne12_RATP',
'Ligne13_RATP', 'Ligne14_RATP' ]
file_path = "data/"
line_dict = dict()
for item in line_list:
line_dict[item] = pd.read_csv(file_path + item +'.csv', sep=';',error_bad_lines=False)
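# A version-robust sketch (assuming pandas may be >= 2.0, where error_bad_lines
# was removed): fall back between on_bad_lines='skip' and error_bad_lines=False
# depending on what the installed pandas accepts.
def read_line_csv(name):
    try:
        return pd.read_csv(file_path + name + '.csv', sep=';', on_bad_lines='skip')
    except TypeError:  # older pandas without the on_bad_lines keyword
        return pd.read_csv(file_path + name + '.csv', sep=';', error_bad_lines=False)
# line_dict = {item: read_line_csv(item) for item in line_list}  # optional drop-in replacement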
In [2]:
line_dict['RER_A'].sort_values(by='retweets',ascending = False).head()
Out[2]:
In [3]:
# Check how many items we have
for k,v in line_dict.items():
print(k, v.shape)
In [4]:
line_dict['Ligne11_RATP'].head()
Out[4]:
In [5]:
line_dict['RER_B'].head()
Out[5]:
In [6]:
# sort by retweets --> oddly, the most retweeted tweet is about a missing dog
line_dict['RER_A'].sort_values(by='retweets',ascending = False)['text'].head()
Out[6]:
In [7]:
# find all station names --> gares
df_st = pd.read_csv('data/gares.csv', delimiter=';')
gares = df_st.nomptar.str.split('(')
gares = [x[0].rstrip(' ') for x in gares] # La Défense has a trailing space
In [8]:
# widen the column display to 200 characters
## Step 1: drop conversational tweets
## (replies containing e.g. Théo, Bonjour, @, Tom, Emma)
import re
def clean_data(input):
pd.options.display.max_colwidth = 200
input['date'] = pd.to_datetime(input.date)
input = input[input.date >= pd.to_datetime('2014-1-1')]
    # normalise common abbreviations (regex=True made explicit; dots escaped so
    # "électr." and "tvx." are matched literally)
    input.text = input.text.str.replace('Pte|pte', 'Porte', regex=True)
    input.text = input.text.str.replace('Chateau|chateau', 'Château', regex=True)
    input.text = input.text.str.replace(r'électr\.', 'électrique', regex=True)
    input.text = input.text.str.replace(r'tvx\.', 'travaux', regex=True)
    # the RER C, D and E accounts publish traffic information
    # under the hashtag "Infotrafic"
if re.search('RER[CDE]_SNCF',input.username.iloc[0]):
output = input[input.text.str.contains('Infotrafic', na=False)]
else:
# for all other lines,
        # we drop the conversational tweets (see report for more details)
to_drop = ["Bonjour", "@",'Théo', 'Emma','Bjr','Inès',
'Lana','vous','soirée','Oui','estimée',
'Travaux prévus','journée','bonjour','rerb',
'rerc','rerd', 'rere','Infotrafic'] # all about conversations
output = input[~input.text.str.contains('|'.join(to_drop), na=False)]
return output
In [9]:
for k in line_dict.keys():
line_dict[k] = clean_data(line_dict[k])
print(k, line_dict[k].shape)
In [10]:
line_dict['RER_A'].sample(3).text
Out[10]:
In [11]:
# top 20 frequent words
import nltk
def words_freq(output):
    tokens = nltk.word_tokenize(output.text.str.lower().str.cat(sep=' '))
    text1 = nltk.Text(tokens)
stopwords = nltk.corpus.stopwords.words('french')
stopwords = stopwords + ['rera','rerb','rerc','rerd','rere',
'ratp','ligne','entre',
'http','les','vers','dir','trafic','gare']
words_except_stop_dist = nltk.FreqDist(w for w in text1 if w not
in stopwords and w.isalpha() )
return words_except_stop_dist
In [12]:
from collections import Counter
def gare_fq(output):
gare_freq = Counter()
for gare in gares:
gare_freq[gare] = output.text.str.lower().str.contains(gare.lower()).sum()
return gare_freq
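# A small variant of gare_fq (an assumption: some station names might contain
# regex metacharacters); regex=False makes the substring match literal.
def gare_fq_literal(output):
    gare_freq = Counter()
    text_lower = output.text.str.lower()
    for gare in gares:
        gare_freq[gare] = text_lower.str.contains(gare.lower(), regex=False).sum()
    return gare_freq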
In [13]:
# caveat: station names such as Cergy-Le-Haut or Nanterre often appear only as the
# train's direction or in "entre XXX et XXX" phrases, so the station counts are indicative only
line_dict['RER_A'].text[line_dict['RER_A'].text.str.contains('Cergy-Le-Haut')].sample(10)
Out[13]:
In [14]:
## Now let's try RER B
output_b = line_dict['RER_B']
words_freq(output_b).most_common(20)
Out[14]:
In [15]:
gare_fq(output_b).most_common(20)
Out[15]:
In [16]:
from collections import Counter
def incident_reason(input):
output = input
incidents = ['malaise voyageur',"incident d'exploitation","incident technique",'Incident de signalisation',
"colis suspect", "voyageur malade", "incident voyageur",
"divers incidents",'panne de signalisation','panne de matériel',
'panne électrique','panne mécanique','panne de caténaire',
"panne d'aiguillage",'panne matériel','panne éléctrique',
'panne sur un train','pannes de signalisation',"panne d'un train",
"panne de train",'obstacle sur la voie', 'bagage abandonné','incident de passage',
'accident de personne','feu aux abords','pb signalisation','acte de malveillance',
                 'jets de pierre','bagage oublié',
'personnes sur les voies','branche tombée','jet de projectile']
incident_freq = Counter()
for incident in incidents:
incident_freq[incident] = output.text.str.lower().str.contains(incident.lower()).sum()
return incident_freq
In [17]:
incident_reason(line_dict['RER_C']).most_common()
Out[17]:
In [18]:
# what if we write a summary function
def summary(input):
output = input
print()
print ('The 20 most frequent words are: ')
print(words_freq(output).most_common(20))
print('\n')
print('The 20 most frequent stations are: ')
print(gare_fq(output).most_common(20))
print('\n')
print('The 20 most frequent reasons are: ')
    print(incident_reason(output).most_common(20))
#summary(line_dict['RER_A'])
In [19]:
# concatenate all per-line dataframes into one, sorted by date
def consol(data_dic):
result = pd.DataFrame()
for k, v in data_dic.items():
result = pd.concat([result, v])
result = result.sort_values(by='date')
return result
df_consol = consol(line_dict)
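# An equivalent, slightly faster sketch: collect the frames and concatenate once
# instead of growing the result inside the loop (same output, less copying).
def consol_once(data_dic):
    return pd.concat(list(data_dic.values())).sort_values(by='date')
# df_consol = consol_once(line_dict)  # interchangeable with consol(line_dict)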
In [20]:
# overall tweets
df_consol.username.value_counts()
Out[20]:
In [21]:
date_tweets = df_consol.date.apply(lambda x: x.date()).value_counts()
date_tweets.iloc[:10]
Out[21]:
In [22]:
from matplotlib import pyplot as plt
%matplotlib inline
date_tweets.plot()
Out[22]:
In [23]:
# export date, username, tweets count
df_consol['date_new'] = df_consol.date.apply(lambda x: x.date())
df_consol.groupby(['date_new', 'username']).size().to_csv('output/tweets_date.csv')
df_consol.to_csv('output/consol.csv')
In [24]:
df_consol['hour'] = df_consol.date.apply(lambda x: x.hour)
df_consol.groupby(['hour','username']).size().to_csv('output/date_hour.csv')
df_consol.sort_values(by='retweets',ascending = False).head()
Out[24]:
In [25]:
df_incident = pd.DataFrame()
for k, v in line_dict.items():
print(k,'\n')
    df_inter = pd.DataFrame.from_dict(incident_reason(v).most_common())
df_inter['username'] = k
df_incident = pd.concat([df_incident, df_inter])
df_incident.sort_values(by=1, ascending = False).head()
Out[25]:
In [26]:
df_incident['group'] = df_incident.iloc[:,0]
rep = {'bagage oublié':'bagage abandonné', 'colis suspect':'bagage abandonné',
'voyageur malade':'malaise voyageur',
'pb signalisation':'panne de signalisation', 'jets de pierre':'acte de malveillance',
       'jet de projectile':'acte de malveillance'}
df_incident.group = df_incident.group.replace(rep)
df_incident.loc[df_incident[0].str.contains('bagage', na=False)].head()
df_incident.to_csv('output/df_incident.csv')
In [27]:
df_incident.head()
Out[27]:
In [28]:
df_temp = pd.read_csv('data/temperature.csv')
df_temp.head()
fil = ['Date','T_avg','V_avg','W_avg', 'rain','fog','snow','Thunderstorms']
df_temp_fil = df_temp[fil]
df_temp_fil.head()
Out[28]:
In [29]:
# load which arrondissements each line passes through
df_arr = pd.read_csv('data/data_arrondissement.csv')
df_arr.head()
Out[29]:
In [30]:
df_traffic = pd.read_csv('data/traffic-16.csv')
df_traffic.head()
Out[30]:
In [31]:
# build another consolidated dataframe and predict the incident reason
def df_class(input):
# list all reasons of incidents
incidents = ['malaise voyageur',"incident d'exploitation","incident technique",
'Incident de signalisation',
"colis suspect", "voyageur malade", "incident voyageur",
"divers incidents",'panne de signalisation','panne de matériel',
'panne électrique','panne mécanique','panne de caténaire',
"panne d'aiguillage",'panne matériel','panne éléctrique',
'panne sur un train','pannes de signalisation',"panne d'un train",
"panne de train",'obstacle sur la voie', 'bagage abandonné','incident de passage',
'accident de personne','feu aux abords','pb signalisation','acte de malveillance',
                 'jets de pierre','bagage oublié',
'personnes sur les voies','branche tombée','jet de projectile',
'grave de voyageur','animal sur la voie','défaut électrique',
'fin tardive de chantier',"Défaut d'alimentation électrique"]
    # clean the tweets, then keep only those that mention a known incident reason
    output = clean_data(input)
    output = output[output.text.str.contains('|'.join(incidents), na=False)]
    # extract the matched reason into its own column (the parentheses form the
    # capture group required by str.extract)
    filt = '(' + '|'.join(incidents) + ')'
    output['reason'] = output.text.str.extract(filt)
filt2 = ['username','date_new','reason']
    # keep only the columns needed for modelling
output = output[filt2]
# create quarter, month, year columns
output.date_new = pd.to_datetime(output.date_new)
df_temp_fil.Date = pd.to_datetime(df_temp_fil.Date)
#merge temperature data, arrondissements data and traffic data
output = output.merge(right=df_temp_fil, how='inner', left_on='date_new', right_on='Date')
output = output.merge(right=df_arr, how='inner', left_on='username', right_on='username')
output = output.merge(right=df_traffic, how='inner', left_on='username', right_on='username')
output['Quarter'] = output.date_new.apply(lambda x: pd.to_datetime(x).quarter)
    output['Month'] = output.date_new.apply(lambda x: pd.to_datetime(x).month)
output['Year'] = output.date_new.apply(lambda x: pd.to_datetime(x).year)
output = output.drop(['date_new','Date'], axis=1)
# standardize all incident reasons
rep = {'bagage oublié':'bagage abandonné', 'colis suspect':'bagage abandonné',
'voyageur malade':'malaise voyageur', "Défaut d'alimentation électrique":'panne électrique',
"panne d'un train":'panne de train','grave de voyageur':'incident voyageur',
'Incident de signalisation':'pannes de signalisation',
'panne de matériel':'panne matériel',
'panne sur un train':'panne de train',
'pb signalisation':'panne de signalisation', 'jets de pierre':'acte de malveillance',
           'jet de projectile':'acte de malveillance',
'accident de personne':'incident voyageur','malaise voyageur':'incident voyageur',
'pannes de signalisation':'panne de signalisation'}
output.reason = output.reason.replace(rep)
    # some rows from df_temp_fil contain '-' placeholders
output = output[output.T_avg != '-']
output = output[output.V_avg != '-']
return output
In [32]:
df_consol.head()
Out[32]:
In [33]:
df_class(df_consol).drop('reason', axis=1).sample(5)
Out[33]:
In [34]:
# let's run classification
from sklearn.model_selection import train_test_split
X = df_class(df_consol).drop('reason', axis=1)
In [35]:
X.head()
Out[35]:
In [36]:
# convert all data into numeric values and scale the data
X.T_avg = pd.to_numeric(X.T_avg)
X.V_avg = pd.to_numeric(X.V_avg)
X.W_avg = pd.to_numeric(X.W_avg)
X = pd.get_dummies(X)
y = df_class(df_consol).reason
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
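# Optional sketch: the scaler above is fit on the training split only, which avoids
# test-set leakage; a scikit-learn Pipeline keeps that guarantee automatic, e.g.
#     from sklearn.pipeline import make_pipeline
#     pipe = make_pipeline(MinMaxScaler(), DecisionTreeClassifier())
#     pipe.fit(X_train, y_train)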
In [38]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import auc, roc_auc_score, accuracy_score, f1_score
tree = DecisionTreeClassifier()
tree.fit(X_train_scaled, y_train)
y_pred = tree.predict(X_test_scaled)
acc_tree = accuracy_score(y_test, y_pred)
f1_tree = f1_score(y_test, y_pred, average = 'weighted')
print('Accuracy is {}'.format(accuracy_score(y_test, y_pred)))
print('F1 score is {}'.format(f1_score(y_test, y_pred, average = 'weighted')))
In [40]:
from sklearn.metrics import confusion_matrix
y_predicted = tree.predict(X_test_scaled)
confusion = confusion_matrix(y_test, y_predicted)
df_cm = pd.DataFrame(confusion)
#sns.set(font_scale=1.4)#for label size
plt.figure(figsize = (10,7))
sns.heatmap(df_cm)
Out[40]:
In [41]:
# run knn
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
plt.figure()
scores = []
for n in range(1,50,20):
knn = KNeighborsClassifier(n_neighbors=n)
knn.fit(X_train_scaled, y_train)
scores.append(knn.score(X_test_scaled, y_test))
plt.plot(range(1,50,20), scores)
plt.title('KNN Accuracy curve')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()
acc_knn = max(scores)
n_knn = list(range(1,50,20))[scores.index(max(scores))]
#
knn = KNeighborsClassifier(n_neighbors=n_knn)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
f1_knn = f1_score(y_test, y_pred, average = 'weighted')
print('Accuracy is {}'.format(acc_knn))
print('Best parameter is n = {}'.format(n_knn))
print('F1 score is {}'.format(f1_knn))
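# A hedged alternative for choosing n_neighbors: cross-validated grid search on the
# training split only, rather than picking n against the test set (the grid below
# is an assumption, not tuned).
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={'n_neighbors': list(range(1, 50, 2))}, cv=5)
grid.fit(X_train_scaled, y_train)
print('CV-selected n_neighbors: {}'.format(grid.best_params_['n_neighbors']))
print('Test accuracy with CV-selected n: {}'.format(grid.score(X_test_scaled, y_test)))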
In [42]:
# performance --> confusion matrix
from sklearn.metrics import confusion_matrix
y_predicted = knn.predict(X_test_scaled)
confusion = confusion_matrix(y_test, y_predicted)
import seaborn as sns
import matplotlib.pyplot as plt
df_cm = pd.DataFrame(confusion)
#sns.set(font_scale=1.4)#for label size
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, cmap="YlGnBu")
Out[42]:
In [43]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_auc_score, accuracy_score, f1_score
scores = []
for n in range(1,200,20):
forest = RandomForestClassifier(n_estimators=n)
forest.fit(X_train_scaled, y_train)
y_pred = forest.predict(X_test_scaled)
scores.append(accuracy_score(y_test, y_pred))
#print(score)
plt.plot(range(1,200,20), scores)
plt.title('Random Forest accuracy curve')
plt.xlabel('n_estimators')
plt.ylabel('Accuracy score')
plt.show()
acc_forest = max(scores)
n_forest = list(range(1,200,20))[scores.index(max(scores))]
#
forest = RandomForestClassifier(n_estimators=n_forest)
forest.fit(X_train_scaled, y_train)
y_pred = forest.predict(X_test_scaled)
f1_forest = f1_score(y_test, y_pred, average = 'weighted')
print('Accuracy is {}'.format(acc_forest))
print('Best parameter is n = {}'.format(n_forest))
print('F1 score is {}'.format(f1_forest))
In [44]:
from sklearn.metrics import confusion_matrix
y_predicted = forest.predict(X_test_scaled)
confusion = confusion_matrix(y_test, y_predicted)
df_cm = pd.DataFrame(confusion)
#sns.set(font_scale=1.4)#for label size
plt.figure(figsize = (10,7))
sns.heatmap(df_cm)
Out[44]:
In [45]:
from sklearn.ensemble import AdaBoostClassifier
scores = []
for n in [1,200,500,1000]:
boost = AdaBoostClassifier(n_estimators=n)
boost.fit(X_train_scaled, y_train)
y_pred = boost.predict(X_test_scaled)
scores.append(accuracy_score(y_test, y_pred))
#print(score)
plt.plot([1,200,500,1000], scores)
plt.title('AdaBoost accuracy curve')
plt.xlabel('n_estimators')
plt.ylabel('Accuracy score')
plt.show()
acc_boost = max(scores)
n_boost = [1,200,500,1000][scores.index(max(scores))]
#
boost = AdaBoostClassifier(n_estimators=n_boost)
boost.fit(X_train_scaled, y_train)
y_pred = boost.predict(X_test_scaled)
f1_boost = f1_score(y_test, y_pred, average = 'weighted')
print('Accuracy is {}'.format(acc_boost))
print('Best parameter is n = {}'.format(n_boost))
print('F1 score is {}'.format(f1_boost))
In [46]:
from sklearn.svm import LinearSVC
import numpy as np
scores = []
rng = [1,10,50,70,100]
for c in rng:
l_svc = LinearSVC(C=c)
l_svc.fit(X_train_scaled, y_train)
y_pred = l_svc.predict(X_test_scaled)
scores.append(accuracy_score(y_test, y_pred))
#print(score)
plt.plot(rng, scores)
plt.title('Linear SVC')
plt.xlabel('C')
plt.ylabel('Accuracy score')
plt.show()
acc_svc = max(scores)
c_svc = rng[scores.index(max(scores))]
#
l_svc = LinearSVC(C=c_svc)
l_svc.fit(X_train_scaled, y_train)
y_pred = l_svc.predict(X_test_scaled)
f1_svc = f1_score(y_test, y_pred, average = 'weighted')
print('Accuracy is {}'.format(acc_svc))
print('Best parameter is c = {}'.format(c_svc))
print('F1 score is {}'.format(f1_svc))
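# Hedged note: with this many one-hot columns LinearSVC can stop at its default
# iteration limit and emit a ConvergenceWarning; raising max_iter is the usual
# remedy (the value below is an assumption, not tuned):
#     l_svc = LinearSVC(C=c_svc, max_iter=10000)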
In [47]:
l_svc = LinearSVC(C=c_svc)
l_svc.fit(X_train_scaled, y_train)
y_predicted = l_svc.predict(X_test_scaled)
confusion = confusion_matrix(y_test, y_predicted)
df_cm = pd.DataFrame(confusion)
#sns.set(font_scale=1.4)#for label size
plt.figure(figsize = (10,7))
sns.heatmap(df_cm)
Out[47]:
In [48]:
from sklearn.linear_model import LogisticRegression
import numpy as np
scores = []
rng = [0.1,1,3,5,10,15]
for c in rng:
lr = LogisticRegression(C=c)
lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)
scores.append(accuracy_score(y_test, y_pred))
#print(score)
plt.plot(rng, scores)
plt.title('Logistic Regression')
plt.xlabel('C')
plt.ylabel('Accuracy score')
plt.show()
acc_lr = max(scores)
c_lr = rng[scores.index(max(scores))]
#
lr = LogisticRegression(C=c_lr)
lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)
f1_lr = f1_score(y_test, y_pred, average = 'weighted')
print('Accuracy is {}'.format(acc_lr))
print('Best parameter is c = {}'.format(c_lr))
print('F1 score is {}'.format(f1_lr))
In [49]:
y_predicted = lr.predict(X_test_scaled)
confusion = confusion_matrix(y_test, y_predicted)
df_cm = pd.DataFrame(confusion)
#sns.set(font_scale=1.4)#for label size
plt.figure(figsize = (10,7))
sns.heatmap(df_cm)
Out[49]:
In [50]:
models = pd.DataFrame({
'Model': ['Linear SVC', 'KNN', 'Random Forest', 'AdaBoost',
'Logistic Regression', 'Decision Tree'],
'Score': [acc_svc,acc_knn, acc_forest, acc_boost,
acc_lr, acc_tree],
'F1 Score':[f1_svc, f1_knn, f1_forest, f1_boost,
f1_lr, f1_tree]})
models.sort_values(by='Score', ascending=False)
Out[50]: