In [ ]:
# Import everything we need
import pandas as pd
from titlecase import titlecase
In [ ]:
# Load the LinkedIn and Twitter exports, one pandas dataframe per source
# File locations come first
# Edit these two names if your CSV files are called something else
linkedin_data_file = 'linkedin_connection_data.csv'
twitter_data_file = 'twitter_user_data.csv'
# Parse each CSV file into its own dataframe
ldf = pd.read_csv(filepath_or_buffer=linkedin_data_file)
tdf = pd.read_csv(filepath_or_buffer=twitter_data_file)
In [ ]:
# Display the LinkedIn dataframe exactly as it was read from the CSV file
print(ldf)
In [ ]:
# Display the Twitter dataframe exactly as it was read from the CSV file
print(tdf)
In [ ]:
# Do some minor formatting of the data so we can join the dataframes together
# Titlecase anything
def titlecase_anything(thing):
    """Return ``thing`` title-cased, or unchanged if it cannot be title-cased.

    This is a best-effort helper: it is applied to whole dataframe columns,
    so a value that ``titlecase`` rejects must not abort the run.

    Args:
        thing: The value to convert; it is passed straight to ``titlecase``.

    Returns:
        The result of ``titlecase(thing)``, or ``thing`` itself when that
        call raises an exception.
    """
    try:
        return titlecase(thing)
    except Exception:
        # Deliberate pass-through for values titlecase() cannot handle
        # (presumably non-strings such as NaN from empty CSV cells).
        # Catching Exception rather than using a bare ``except:`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return thing
# Create a full name from the first and last
def create_full_name(first_name, last_name):
    """Join a first name and a last name with a single space between them.

    The pieces are combined with ``+``, so both arguments must support
    concatenation with a string.

    Args:
        first_name: The leading part of the name.
        last_name: The trailing part of the name.

    Returns:
        ``first_name``, one space, then ``last_name``.
    """
    name_with_gap = first_name + " "
    return name_with_gap + last_name
# Here's why we're using pandas - to apply our functions to an entire column at once
# First we'll titlecase the first_name and last_name columns that came from LinkedIn
# Note that we never pass an argument to titlecase_anything ourselves - apply() calls it once per value
# Bracket notation is used for assignment because it always targets a column
ldf['first_name'] = ldf['first_name'].apply(titlecase_anything)
ldf['last_name'] = ldf['last_name'].apply(titlecase_anything)
# Second, because Twitter provides us a full name, if the user provided it, we'll build a matching
# 'name' column in our LinkedIn data with the create_full_name helper defined above
ldf['name'] = create_full_name(ldf['first_name'], ldf['last_name'])
In [ ]:
# Display the Twitter dataframe again for comparison with the LinkedIn data below
print(tdf)
In [ ]:
# Display the LinkedIn dataframe, which now has titlecased names and the new 'name' column
print(ldf)
In [ ]:
# Perform a database-style left join on our dataframes using the merge function from pandas
# A left join keeps every Twitter row; the LinkedIn columns are filled with NaN where no name matched
# Unlike SQL, pandas matches null keys against each other, so LinkedIn rows without a name are
# dropped from the right-hand side first. Otherwise every Twitter user with no name would be
# paired with every LinkedIn connection with no name. ldf itself is left untouched.
mdf = tdf.merge(ldf.dropna(subset=['name']), on='name', how='left')
mdf  # In a notebook the last expression of a cell is displayed automatically, so no print() is needed
In [ ]:
# Save the merged dataset as a CSV file
# Rows that found no LinkedIn match are written too, so the gaps can be filled in later
merged_csv_path = 'twitter_and_linkedin_merged_data.csv'
mdf.to_csv(merged_csv_path, encoding='utf-8', sep=',')
print("CSV creation complete")
In [ ]: