In [1]:
import urllib
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import os,sys
import time
from datetime import date
try:
    import cPickle as pickle  # faster C implementation on Python 2
except ImportError:
    import pickle
import pprint
from collections import deque
from shutil import copyfile
import random
In [2]:
# %load Gender_Prediction.py
# Import Libraries
import os
import re
import urllib2
from zipfile import ZipFile
import csv
import cPickle as pickle
def downloadNames():
    # Download the SSA baby-names archive (binary data, so write in 'wb' mode).
    u = urllib2.urlopen('https://www.ssa.gov/oact/babynames/names.zip')
    localFile = open('names.zip', 'wb')
    localFile.write(u.read())
    localFile.close()
def getNameList():
    if not os.path.exists('names.pickle'):
        print 'names.pickle does not exist, generating'
        # https://www.ssa.gov/oact/babynames/names.zip
        if not os.path.exists('names.zip'):
            print 'names.zip does not exist, downloading from ssa.gov'
            downloadNames()
        else:
            print 'names.zip exists, not downloading'
        print 'Extracting names from names.zip'
        namesDict = extractNamesDict()
        maleNames = list()
        femaleNames = list()
        print 'Sorting Names'
        for name in namesDict:
            counts = namesDict[name]
            entry = (name, counts[0], counts[1])  # (name, male count, female count)
            if counts[0] > counts[1]:
                maleNames.append(entry)
            elif counts[1] > counts[0]:
                femaleNames.append(entry)
        names = (maleNames, femaleNames)
        print 'Saving names.pickle'
        fw = open('names.pickle', 'wb')
        pickle.dump(names, fw, -1)
        fw.close()
        print 'Saved names.pickle'
    else:
        print 'names.pickle exists, loading data'
        f = open('names.pickle', 'rb')
        names = pickle.load(f)
        print 'names.pickle loaded'
    print '%d male names loaded, %d female names loaded' % (len(names[0]), len(names[1]))
    return names[0], names[1]
def extractNamesDict():
    # Aggregate per-name male/female counts across all yearly files in the zip.
    zf = ZipFile('names.zip', 'r')
    filenames = zf.namelist()
    names = dict()
    genderMap = {'M': 0, 'F': 1}
    for filename in filenames:
        fp = zf.open(filename, 'rU')
        rows = csv.reader(fp, delimiter=',')
        try:
            for row in rows:
                try:
                    name = row[0].upper()
                    gender = genderMap[row[1]]
                    count = int(row[2])
                    if name not in names:
                        names[name] = [0, 0]
                    names[name][gender] += count
                except (IndexError, KeyError, ValueError):
                    pass  # skip malformed rows
        except csv.Error:
            pass  # skip non-CSV members of the archive (e.g. the readme)
        fp.close()
        print '\tImported %s' % filename
    return names
def find_gender_from_first_name(name):
    # Relies on new_male_list / new_female_list built in the main block below.
    if name.upper() in new_male_list:
        return "Male"
    elif name.upper() in new_female_list:
        return "Female"
    else:
        return "Unknown"
if __name__ == '__main__':
    male_names, female_names = getNameList()
    new_male_list = []
    new_female_list = []
    # Keep only names whose dominant-gender count is at least 4x the other;
    # a ZeroDivisionError means the other count is 0, so the name is kept.
    for index, name in enumerate(male_names):
        try:
            if (name[1] / name[2]) >= 4:
                new_male_list.append(name[0])
        except ZeroDivisionError:
            new_male_list.append(name[0])
    #print "Total number of Male Names after is %d." % len(new_male_list)
    for index, name in enumerate(female_names):
        try:
            if (name[2] / name[1]) >= 4:
                new_female_list.append(name[0])
        except ZeroDivisionError:
            new_female_list.append(name[0])
    #print "Total number of Female Names after is %d." % len(new_female_list)
#find_gender_from_first_name('Harsh')
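A quick sanity check of the lookup (hypothetical names; results depend on the SSA data actually downloaded):
In [ ]:
# Hypothetical spot checks -- output depends on the SSA counts loaded above.
for n in ['John', 'Mary', 'Taylor']:
    print n, '->', find_gender_from_first_name(n)
# 'Taylor' is close to gender-balanced in the SSA data, so it will often
# fall outside the 4:1 ratio filter and come back as "Unknown".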
In [3]:
def getProfilePicLink(html):
    soup = BeautifulSoup(html, "lxml")
    images = [x for x in soup.find_all('img')]
    try:
        # Swap the 200x200 thumbnail URL for the 400x400 version.
        if "shrinknp_200_200" in str(images[0]):
            imageUrlString = str(images[0]).replace("shrinknp_200_200", "shrinknp_400_400")
        else:
            imageUrlString = ""
    except IndexError:  # page has no <img> tags
        imageUrlString = ""
    return imageUrlString
In [4]:
def storeProfilePicture(imgTagString, profile_link):
    # imgTagString is the <img ...> markup returned by getProfilePicLink;
    # profile_link is the profile URL, whose last segment is the user id.
    lst = imgTagString.split()
    userId = profile_link.split('/')[-1]
    regex = re.compile(r'(src).*')
    img_url = re.sub('src=', '', "".join([m.group(0) for l in lst for m in [regex.search(l)] if m]))
    img_url = img_url.strip('""')
    if img_url:
        urllib.urlretrieve(img_url, "Images/" + userId + ".jpg")
        print userId + ".jpg is saved."
        return img_url
    else:
        # No picture found: save the placeholder "ghost person" image instead.
        copyfile('ghost_person.png', "Images/" + userId + ".png")
        print userId + ".png is saved."
        return "https://static.licdn.com/scds/common/u/images/themes/katy/ghosts/person/ghost_person_100x100_v1.png"
In [5]:
def getRecommendedUserIds(html):
    # Pull the recommended-profile cards and extract each user id from the
    # first URL found in the card's markup.
    soup = BeautifulSoup(html, "lxml")
    profLinks = [x for x in soup.find_all('li', {'class': 'profile-card'})]
    recUserIds = []
    for link in profLinks:
        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(link))
        recId = urls[0].split('?')[0].split('/')[-1]
        recUserIds.append(recId)
    return recUserIds
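The same idea without the long URL regex (a sketch with a hypothetical name; it assumes each profile card contains a plain <a href=...> to the profile):
In [ ]:
# Sketch: read hrefs directly instead of regex-matching the card's markup.
def getRecommendedUserIdsViaHrefs(html):
    soup = BeautifulSoup(html, "lxml")
    ids = []
    for card in soup.find_all('li', {'class': 'profile-card'}):
        a = card.find('a', href=True)
        if a:
            ids.append(a['href'].split('?')[0].rstrip('/').split('/')[-1])
    return ids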
In [6]:
def getName(html):
    # The public-profile <title> has the form "Full Name | LinkedIn".
    soup = BeautifulSoup(html, "lxml")
    title = soup.find('title')
    name = str(title).replace('<title>', '')
    full_name = name.replace(' | LinkedIn</title>', '')
    return full_name
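A slightly more robust sketch of the same extraction (hypothetical name), using get_text() so the tag markup never needs to be string-replaced:
In [ ]:
# Sketch: same result via get_text(), tolerant of attributes on <title>.
def getNameViaGetText(html):
    title = BeautifulSoup(html, "lxml").title
    return title.get_text().replace(' | LinkedIn', '').strip() if title else ""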
In [7]:
def getBachelorList():
    # Map each degree abbreviation to its full name, parsed from lines of the
    # form "Bachelor of Science (BS/BSc)" in bachelor_degrees.txt.
    BachelorDict = {}
    regex = re.compile('[^a-zA-Z/]')
    with open('bachelor_degrees.txt') as fp:
        for line in fp.readlines():
            lineSeparator = line.split('(')
            abbr = regex.sub('', lineSeparator[1])
            abbr = abbr.split('/')
            for abrv in abbr:
                BachelorDict[abrv] = lineSeparator[0].strip()
    return BachelorDict
In [8]:
def calculate_age(bachelor_year):
    # Heuristic: assume the person was 18 when the bachelor's degree started.
    today = date.today()
    return today.year - bachelor_year + 18
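For example (hypothetical year): a bachelor's start of 2010 evaluated in 2016 gives 2016 - 2010 + 18 = 24.
In [ ]:
# Hypothetical check: with today.year == 2016, calculate_age(2010) -> 24.
print calculate_age(2010)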
In [9]:
def get_Degree_Duration(html):
    # For each education entry, collect the degree name and its date range,
    # stripping the surrounding span/time markup by string replacement.
    soup = BeautifulSoup(html.encode("ascii", "ignore"), "lxml")
    schoolLinks = soup.find_all('li', {'class': 'school'})
    degreeList = []
    time_range_list = []
    for soup1 in schoolLinks:
        degreeLink = soup1.find('span', {'data-field-name': "Education.DegreeName"})
        timeRange = soup1.find('span', {'class': "date-range"})
        tempDegree = str(degreeLink).replace('<span class="translated translation" data-field-name="Education.DegreeName">', '')
        degree = tempDegree.replace('</span>', '')
        tempTime = str(timeRange).replace('<span class="date-range">', '')
        timeText = tempTime.replace('<time>', '')
        timeText = timeText.replace('</time>', '')
        time_range = timeText.replace('</span>', '')
        degreeList.append(degree)
        time_range_list.append(time_range)
    return degreeList, time_range_list
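As with the name extraction, a get_text() sketch (hypothetical name; same selectors assumed) avoids hard-coding the exact span markup. It mirrors the original's behavior of yielding the string 'None' when a field is missing:
In [ ]:
# Sketch: read text content directly instead of stripping tags by hand.
def getDegreeDurationViaGetText(html):
    soup = BeautifulSoup(html, "lxml")
    degrees, ranges = [], []
    for school in soup.find_all('li', {'class': 'school'}):
        d = school.find('span', {'data-field-name': "Education.DegreeName"})
        t = school.find('span', {'class': "date-range"})
        degrees.append(d.get_text() if d else 'None')
        ranges.append(t.get_text() if t else 'None')
    return degrees, ranges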
In [10]:
def find_bachelor_year(degree_list, time_list):
    # Match each degree against the known bachelor abbreviations and full
    # names, collecting the date ranges of any matches.
    bachelor_degree_duration = set()
    BachelorDict = getBachelorList()
    for index, dg in enumerate(degree_list):
        for key in BachelorDict.keys():
            if key in dg:
                bachelor_degree_duration.add(time_list[index])
                break
        for value in BachelorDict.values():
            if value in dg:
                bachelor_degree_duration.add(time_list[index])
                break
    bachelor_degree_duration = list(bachelor_degree_duration)
    try:
        if not bachelor_degree_duration:
            # No bachelor's found: guess its start as 5 years before the
            # start year of the first listed degree.
            if time_list[0]:
                bachelor_year = int(time_list[0].split()[0]) - 5
            else:
                bachelor_year = None
        else:
            bachelor_year = int(bachelor_degree_duration[0].split()[0])
    except (IndexError, ValueError):
        bachelor_year = None
    return bachelor_year
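A contrived check of both paths (made-up inputs; assumes 'BS' appears as an abbreviation in bachelor_degrees.txt):
In [ ]:
# Made-up inputs to exercise the match path and the minus-5 fallback.
print find_bachelor_year(['Bachelor of Science BS'], ['2009 2013'])  # -> 2009
print find_bachelor_year(['Master of Science'], ['2014 2016'])       # -> 2009 (2014 - 5)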
In [11]:
def age_from_linkedin_profile(html):
    # html is the profile page source, as returned by driver.page_source.
    try:
        degree_list, time_list = get_Degree_Duration(html)
        refined_degree_list = []
        regex = re.compile('[^a-zA-Z\s+]')
        for degree in degree_list:
            refined_degree_list.append(regex.sub('', degree))
        bachelor_year = find_bachelor_year(refined_degree_list, time_list)
        if bachelor_year:
            age = calculate_age(bachelor_year)
        else:
            age = None
    except:
        age = None
    return age
In [12]:
def MakeProfileDictionary(usrid):
    profileUrl = "https://www.linkedin.com/in/" + usrid
    driver = webdriver.PhantomJS(service_log_path=os.path.devnull)
    driver.get(profileUrl)
    html = driver.page_source
    driver.quit()  # avoid leaking a PhantomJS process per profile
    if "Parse the tracking code from cookies." in html:
        # LinkedIn served a block page instead of the profile; returning None
        # makes the caller's unpacking fail, which its except clause handles.
        return
    else:
        with open("Profile_Source/" + usrid + ".txt", 'wb') as fp:
            fp.write(html.encode('utf-8'))
        profileDict = {}
        profileDict['User_ID'] = usrid
        profileDict['Full_Name'] = getName(html)
        profileDict['Gender'] = find_gender_from_first_name(getName(html).split()[0])
        recommended_profile_ids = getRecommendedUserIds(html)
        profileDict['Recommended_Ids'] = recommended_profile_ids
        profilePicUrl = getProfilePicLink(html)
        picUrl = storeProfilePicture(profilePicUrl, profileUrl)
        profileDict['Profile_Url'] = picUrl
        profileDict['age'] = age_from_linkedin_profile(html)
        return profileDict, recommended_profile_ids
In [48]:
# Back up the pickles before this run mutates them.
copyfile('linkedin_UserIds.pickle', 'Temp_Files/linkedin_UserIds.pickle')
copyfile('linkedin_Black_Listed_UserIds.pickle', 'Temp_Files/linkedin_Black_Listed_UserIds.pickle')
copyfile('linkedin_profiles.pickle', 'Temp_Files/linkedin_profiles.pickle')
#copyfile('linkedin_profiles_temp.pickle','Temp_Files/linkedin_profiles_temp.pickle')
In [49]:
with open("linkedin_UserIds.pickle", "rb") as pkl_file:
    userIds_list = pickle.load(pkl_file)
with open("linkedin_Black_Listed_UserIds.pickle", "rb") as pkl_file:
    black_listed_userids = pickle.load(pkl_file)
with open("linkedin_profiles.pickle", "rb") as pkl_file:
    my_original_list = pickle.load(pkl_file)
In [50]:
len(my_original_list)
Out[50]:
In [51]:
len(black_listed_userids)
Out[51]:
In [ ]:
if __name__ == '__main__':
    directory = "Images"
    if not os.path.exists(directory):
        os.makedirs(directory)
    directory1 = "Profile_Source"
    if not os.path.exists(directory1):
        os.makedirs(directory1)
    #userIds = deque(["harshparikh1001","marissamayer","williamhgates","mbilgic"])
    # Ids already scraped in previous runs, skipped below.
    uids = [d['User_ID'] for d in my_original_list]
    uniqueIds = list(set(uids))
    userIds = userIds_list
    profiles = []
    output = open("linkedin_profiles_temp.pickle", 'wb')  # save this run's profiles
    count = 0
    temp_count = 0
    last_call = 0
    # Stop once 100 new usable profiles (known age + real photo) are collected.
    while count != 100:
        usrid = userIds.popleft()
        temp_count += 1
        print temp_count
        if (usrid not in uniqueIds) and (usrid not in black_listed_userids):
            try:
                profileDict, recommed_id_list = MakeProfileDictionary(usrid)
                userIds.extend(recommed_id_list)
                if (profileDict['age'] is not None) and (profileDict['Profile_Url'].endswith('.jpg')):
                    count += 1
                    print count
                    profiles.append(profileDict)
                else:
                    black_listed_userids.append(usrid)
                time.sleep(5)  # delay 5 seconds between requests
            except:
                # Most failures here mean LinkedIn served a block page;
                # back off for 5 minutes before continuing.
                try:
                    print "\n\n*******Your IP got tracked... Wait for sometime..*******\n\n"
                    last_call += 1
                    mins = 0
                    while mins != 5:
                        print ">>>>>>>>>>>>>>>>>>>>>", mins
                        time.sleep(60)
                        mins += 1
                except:
                    break
    pickle.dump(profiles, output, -1)
    output.close()
In [53]:
with open("linkedin_UserIds.pickle", "wb") as pkl_fl:
    pickle.dump(userIds, pkl_fl, -1)
with open("linkedin_Black_Listed_UserIds.pickle", "wb") as pk_fl:
    pickle.dump(black_listed_userids, pk_fl, -1)
with open("linkedin_profiles_temp.pickle", "rb") as pkl_file:
    temp_list = pickle.load(pkl_file)
In [54]:
len(temp_list)
Out[54]:
In [44]:
profile_list = my_original_list + temp_list
In [45]:
len(profile_list)
Out[45]:
In [46]:
# Remove any placeholder .png images, keeping only real .jpg photos.
for fname in os.listdir("Images"):
    file_path = os.path.join("Images", fname)
    try:
        if not fname.endswith('.jpg'):
            os.unlink(file_path)
    except Exception as e:
        print(e)
In [47]:
with open("linkedin_profiles.pickle", "wb") as pkl_file:
    pickle.dump(profile_list, pkl_file, -1)