Import Libraries


In [1]:
import urllib
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import os,sys
import time
from datetime import date
try:
    import cPickle as pickle
except:
    import pickle
import pprint
from collections import deque
from shutil import copyfile
import random
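
Note: these cells target Python 2 (print statements, urllib/urllib2, cPickle). BeautifulSoup is installed as the beautifulsoup4 package, Selenium as the selenium package, and webdriver.PhantomJS additionally needs the PhantomJS binary available on the system PATH.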

Load the Gender Predictor File to Predict the Gender of LinkedIn Profiles


In [2]:
# %load Gender_Prediction.py

# Import Libraries

import os
import re
import urllib2
from zipfile import ZipFile
import csv
import cPickle as pickle

def downloadNames():
    # Download the SSA baby-names archive; write in binary mode since it is a zip
    u = urllib2.urlopen('https://www.ssa.gov/oact/babynames/names.zip')
    localFile = open('names.zip', 'wb')
    localFile.write(u.read())
    localFile.close()

def getNameList():
    if not os.path.exists('names.pickle'):
        print 'names.pickle does not exist, generating'
        
        # https://www.ssa.gov/oact/babynames/names.zip
        
        if not os.path.exists('names.zip'):
            print 'names.zip does not exist, downloading from ssa.gov'
            downloadNames()
        else:
            print 'names.zip exists, not downloading'
        
        print 'Extracting names from names.zip'  
        
        namesDict=extractNamesDict()
        
        maleNames=list()
        femaleNames=list()
        
        print 'Sorting Names'
        
        for name in namesDict:
            counts = namesDict[name]
            entry = (name, counts[0], counts[1])   # (name, male count, female count)
            if counts[0] > counts[1]:
                maleNames.append(entry)
            elif counts[1] > counts[0]:
                femaleNames.append(entry)
        
        names=(maleNames,femaleNames)
        
        print 'Saving names.pickle'
        fw=open('names.pickle','wb')
        pickle.dump(names,fw,-1)
        fw.close()
        print 'Saved names.pickle'
    else:
        print 'names.pickle exists, loading data'
        f=open('names.pickle','rb')
        names=pickle.load(f)
        print 'names.pickle loaded'
        
    print '%d male names loaded, %d female names loaded'%(len(names[0]),len(names[1]))
    
    return names[0],names[1]

def extractNamesDict():
    zf=ZipFile('names.zip', 'r')
    filenames=zf.namelist()
    
    names=dict()
    genderMap={'M':0,'F':1}
    
    for filename in filenames:
        fp = zf.open(filename, 'rU')
        rows = csv.reader(fp, delimiter=',')
        try:
            for row in rows:
                try:
                    name = row[0].upper()
                    gender = genderMap[row[1]]
                    count = int(row[2])

                    # Accumulate per-gender counts for each name
                    if name not in names:
                        names[name] = [0, 0]

                    names[name][gender] += count
                except (IndexError, KeyError, ValueError):
                    # Skip malformed rows rather than aborting the import
                    pass
        except csv.Error:
            pass

        fp.close()

        print '\tImported %s' % filename
    return names


def find_gender_from_first_name(name):
    if name.upper() in new_male_list:
        return "Male"
    elif name.upper() in new_female_list:
        return "Female"
    else:
        return "Unknown"    

if __name__ == '__main__':

    male_names, female_names = getNameList()
    new_male_list = []
    new_female_list = []

    # Keep a name only when it is at least 4x more common for one gender;
    # a ZeroDivisionError means the name never occurs for the other gender,
    # so it is unambiguous and kept as well.
    for name in male_names:
        try:
            if (name[1] / name[2]) >= 4:
                new_male_list.append(name[0])
        except ZeroDivisionError:
            new_male_list.append(name[0])

    #print "Total number of Male Names is %d." % len(new_male_list)

    for name in female_names:
        try:
            if (name[2] / name[1]) >= 4:
                new_female_list.append(name[0])
        except ZeroDivisionError:
            new_female_list.append(name[0])

    #print "Total number of Female Names is %d." % len(new_female_list)

    #find_gender_from_first_name('Harsh')


names.pickle exists, loading data
names.pickle loaded
34722 male names loaded, 60185 female names loaded
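
As a quick sanity check, the classifier can be queried directly once the cell above has run. The names below are illustrative inputs only; any name absent from the SSA data, or without at least a 4:1 skew toward one gender, comes back as "Unknown".


In [ ]:
# Illustrative usage; assumes the cell above has populated new_male_list
# and new_female_list in this session
print find_gender_from_first_name('Harsh')
print find_gender_from_first_name('Marissa')
print find_gender_from_first_name('Xyzzy')   # not in the SSA data -> Unknown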

In [3]:
def getProfilePicLink(html):

    soup = BeautifulSoup(html, "lxml")
    images = soup.find_all('img')
    #print images
    try:
        # Request the higher-resolution 400x400 rendition of the profile photo
        if "shrinknp_200_200" in str(images[0]):
            imageUrlString = str(images[0]).replace("shrinknp_200_200", "shrinknp_400_400")
        else:
            imageUrlString = ""
    except IndexError:
        # No <img> tags on the page
        imageUrlString = ""

    #print imageUrlString

    return imageUrlString
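
The size token embedded in the image URL is what controls the thumbnail resolution, so replacing it requests a larger rendition of the same photo. A hypothetical example of the rewrite:


In [ ]:
# Hypothetical media URL, for illustration only
url_200 = "https://media.licdn.com/mpr/mpr/shrinknp_200_200/AAEAAQ.jpg"
print url_200.replace("shrinknp_200_200", "shrinknp_400_400")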

Store the Profile Picture in a Local Directory


In [4]:
def storeProfilePicture(imgTagString, profile_link):

    # imgTagString is the <img ...> markup returned by getProfilePicLink;
    # pull the src attribute out of it with a regex
    lst = imgTagString.split()
    userId = profile_link.split('/')[-1]
    regex = re.compile(r'(src).*')
    img_url = re.sub('src=', '', "".join([m.group(0) for l in lst for m in [regex.search(l)] if m]))
    img_url = img_url.strip('"')
    #print img_url
    if img_url:
        urllib.urlretrieve(img_url, "Images/" + userId + ".jpg")
        print userId + ".jpg is saved."
        return img_url
    else:
        # No photo found: fall back to a local copy of LinkedIn's placeholder image
        with open('ghost_person.png', 'rb') as f:
            data = f.read()

        with open("Images/" + userId + ".png", 'wb') as f:
            f.write(data)

        print userId + ".png is saved."
        return "https://static.licdn.com/scds/common/u/images/themes/katy/ghosts/person/ghost_person_100x100_v1.png"

In [5]:
def getRecommendedUserIds(html):

    soup = BeautifulSoup(html, "lxml")
    # Recommended profiles are rendered as <li class="profile-card"> entries
    profLinks = soup.find_all('li', {'class': 'profile-card'})
    
    recUserIds = []
    #print profLinks
    for link in profLinks:
        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(link))
        recId = urls[0].split('?')[0].split('/')[-1]
        recUserIds.append(recId)
    
    return recUserIds
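
Each profile-card link carries tracking parameters, so the user id is whatever follows the last slash once the query string is dropped. For example, with a hypothetical URL:


In [ ]:
# Hypothetical recommended-profile URL
url = "https://www.linkedin.com/in/janedoe?trk=pub-pbmap"
print url.split('?')[0].split('/')[-1]   # -> janedoe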

Get the Full Name from the Title in the Page Source


In [6]:
def getName(html):

    soup = BeautifulSoup(html, "lxml")
    title = soup.find('title')
    # Profile pages are titled "Full Name | LinkedIn"
    name = str(title).replace('<title>', '')
    full_name = name.replace(' | LinkedIn</title>', '')

    return full_name

Get the List of Bachelor's Degrees and Build a Dictionary


In [7]:
def getBachelorList():

    BachelorDict = {}
    regex = re.compile('[^a-zA-Z/]')
    with open('bachelor_degrees.txt') as fp:
        for line in fp:
            # Each line pairs a degree name with its abbreviations: "Name (ABBR/ABBR)"
            lineSeparator = line.split('(')
            abbr = regex.sub('', lineSeparator[1])
            abbr = abbr.split('/')
            for abrv in abbr:
                BachelorDict[abrv] = lineSeparator[0].strip()

    return BachelorDict
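
The parser assumes each line of bachelor_degrees.txt pairs a degree name with its abbreviations in parentheses. A minimal sketch of how one hypothetical line is split:


In [ ]:
# Hypothetical line; the real bachelor_degrees.txt format is assumed from the parser
line = "Bachelor of Science (B.S./B.Sc.)"
parts = line.split('(')
print parts[0].strip()                                        # Bachelor of Science
print re.compile('[^a-zA-Z/]').sub('', parts[1]).split('/')   # ['BS', 'BSc']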

Calculate Age from the Bachelor's Degree Starting Year


In [8]:
def calculate_age(bachelor_year):
    # Assumes a bachelor's degree is typically started at age 18
    today = date.today()
    return today.year - bachelor_year + 18
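
A quick worked example: since the heuristic assumes bachelor's studies begin around age 18, a 2010 start year yields an estimate of (current year - 2010 + 18), e.g. 25 if run in 2017.


In [ ]:
print calculate_age(2010)   # current year - 2010 + 18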

Find All of a Person's Degrees and Their Durations


In [9]:
def get_Degree_Duration(html):
    
    soup=BeautifulSoup(html.encode("ascii","ignore"),"lxml")
    schoolLinks = soup.find_all('li',{'class':'school'})
    degreeList = []
    time_range_list = []
    #print schoolLinks
    
    for soup1 in schoolLinks:

        degreeLink = soup1.find('span', {'data-field-name': "Education.DegreeName"})
        #print degreeLink
        timeRange = soup1.find('span', {'class': "date-range"})
        #print timeRange

        # Strip the known wrapper markup to leave just the degree name...
        tempDegree = str(degreeLink).replace('<span class="translated translation" data-field-name="Education.DegreeName">', '')
        degree = tempDegree.replace('</span>', '')

        # ...and just the date range (e.g. "2010 - 2014")
        tempTime = str(timeRange).replace('<span class="date-range">', '')
        tempTime = tempTime.replace('<time>', '')
        tempTime = tempTime.replace('</time>', '')
        time_range = tempTime.replace('</span>', '')
        #print time_range
        degreeList.append(degree)
        time_range_list.append(time_range)
    
    return degreeList,time_range_list

Find the Bachelor's Year from the Degree List and Its Duration


In [10]:
def find_bachelor_year(degree_list,time_list):
    
    bachelor_degree_duration = set()
    
    BachelorDict = getBachelorList()
    
    for index,dg in enumerate(degree_list):

        for key in BachelorDict.keys():
            if key in dg:
                bachelor_degree_duration.add(time_list[index])
                break

        for value in BachelorDict.values():
            if value in dg:
                bachelor_degree_duration.add(time_list[index])
                break
        
        
    #print time_list
    bachelor_degree_duration = list(bachelor_degree_duration)

    try:
        if not bachelor_degree_duration:
            if time_list[0]:
                # Fallback heuristic: no explicit bachelor's entry matched, so
                # assume the first listed degree is a later one and that the
                # bachelor's began roughly 5 years before it
                bachelor_year = int(time_list[0].split()[0]) - 5
            else:
                bachelor_year = None
        else:
            # Use the start year of the matched date range, e.g. "2010 - 2014"
            bachelor_year = int(bachelor_degree_duration[0].split()[0])
    except (IndexError, ValueError):
        bachelor_year = None

    return bachelor_year
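
To make the fallback concrete, here is an illustrative call (it still reads bachelor_degrees.txt via getBachelorList, and the first case assumes that file lists "Bachelor of Science"):


In [ ]:
# Matched case: the start year of the bachelor's range is used directly
print find_bachelor_year(['Bachelor of Science'], ['2010 - 2014'])   # -> 2010
# Unmatched case: fall back to the first degree's start year minus 5
print find_bachelor_year(['Master of Science'], ['2015 - 2017'])     # -> 2010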

Find the Age from a LinkedIn Profile


In [11]:
def age_from_linkedin_profile(html):

    try:
        degree_list, time_list = get_Degree_Duration(html)
    
        refined_degree_list = []
        regex = re.compile('[^a-zA-Z\s+]')

        for degree in degree_list:
            refined_degree_list.append(regex.sub('',degree))

        bachelor_year = find_bachelor_year(refined_degree_list,time_list)
        #print bachelor_year

        if bachelor_year:
            age = calculate_age(bachelor_year)
        else:
            age = None
    
    except:
        age = None
        
    return age

In [12]:
def MakeProfileDictionary(usrid):
    
    #recommended_profile_ids = []
    profileUrl = "https://www.linkedin.com/in/" + usrid
        
    # Headless PhantomJS browser; send the ghostdriver log to the null device
    driver = webdriver.PhantomJS(service_log_path=os.path.devnull)
    driver.get(profileUrl)
    html = driver.page_source
    
    if "Parse the tracking code from cookies." in html:
        return
    
    else:
        with open("Profile_Source/" + usrid + ".txt", 'wb') as fp:
                fp.write(html.encode('utf-8'))

        fp.close()


        profileDict = {}

        profileDict['User_ID'] = usrid
        profileDict['Full_Name'] = getName(html)
        profileDict['Gender'] = find_gender_from_first_name(getName(html).split()[0])
        recommended_profile_ids = getRecommendedUserIds(html)
        profileDict['Recommended_Ids'] = recommended_profile_ids

        profilePicUrl = getProfilePicLink(html)

        picUrl = storeProfilePicture(profilePicUrl,profileUrl)

        profileDict['Profile_Url'] = picUrl
        profileDict['age'] = age_from_linkedin_profile(html)

        return profileDict,recommended_profile_ids
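
An illustrative single-profile call (needs PhantomJS and network access; the id below appears in the seed list further down and is used here purely as an example):


In [ ]:
# Returns None when LinkedIn serves its block page instead of the profile
result = MakeProfileDictionary("williamhgates")
if result:
    profileDict, rec_ids = result
    pprint.pprint(profileDict)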

Copy Files for Backup


In [48]:
copyfile('linkedin_UserIds.pickle','Temp_Files/linkedin_UserIds.pickle')
copyfile('linkedin_Black_Listed_UserIds.pickle','Temp_Files/linkedin_Black_Listed_UserIds.pickle')
copyfile('linkedin_profiles.pickle','Temp_Files/linkedin_profiles.pickle')
#copyfile('linkedin_profiles_temp.pickle','Temp_Files/linkedin_profiles_temp.pickle')

Load Values from the Pickle Files


In [49]:
pkl_file = open("linkedin_UserIds.pickle","rb")
userIds_list=pickle.load(pkl_file) # errors out here
pkl_file.close()

pkl_file = open("linkedin_Black_Listed_UserIds.pickle","rb")
black_listed_userids=pickle.load(pkl_file) # errors out here
pkl_file.close()


pkl_fl = open("linkedin_profiles.pickle","rb")
my_original_list=pickle.load(pkl_fl) # errors out here
pkl_fl.close()

In [50]:
len(my_original_list)


Out[50]:
2670

In [51]:
len(black_listed_userids)


Out[51]:
3931

Main Loop


In [ ]:
if __name__ == '__main__':
    
    directory = "Images"
    
    if not os.path.exists(directory):
        os.makedirs(directory)
        
    directory1 = "Profile_Source"
    
    if not os.path.exists(directory1):
        os.makedirs(directory1)    
    
    #userIds = deque(["harshparikh1001","marissamayer","williamhgates","mbilgic"])
    #userIds = ["harshparikh1001"]
    #recommended_profile_ids = []
    
    uids = [d['User_ID'] for d in my_original_list]
    uniqueIds = list(set(uids))
    
    userIds = userIds_list
    
    profiles = []
    
    output = open("linkedin_profiles_temp.pickle", 'wb')   # Save all profiles as pickle file
    
    count = 0
    temp_count = 0
    last_call = 0
    #black_listed_userids = []
    
    while count != 100:    # stop once 100 usable profiles have been collected
        
        usrid = userIds.popleft()
        #userIds.remove(usrid)
        temp_count+=1
        print temp_count
        
        if (usrid not in uniqueIds) and (usrid not in black_listed_userids):
            
            try:
                
                profileDict,recommed_id_list = MakeProfileDictionary(usrid)
                
                userIds.extend(recommed_id_list)
                
                if (profileDict['age'] is not None) and (profileDict['Profile_Url'].endswith('.jpg')):
                    count += 1
                    print count
                    profiles.append(profileDict)
                else:
                    black_listed_userids.append(usrid)
                    
                time.sleep(5)     # delays for 5 seconds

                #if temp_count % 10 == 0:
                #    time.sleep(100) # delays for 100 seconds
            
            except:
                try:
                    print "\n\n*******Your IP got tracked... Wait for sometime..*******\n\n"

                    last_call += 1

                    #if last_call >= 5:
                    #    print "\nPlease try again later.. :)"
                    #    break

                    # Back off for five minutes, printing progress once a minute
                    mins = 0
                    while mins != 5:
                        print ">>>>>>>>>>>>>>>>>>>>>", mins
                        time.sleep(60)
                        mins += 1

                except KeyboardInterrupt:
                    # Let Ctrl-C during the back-off end the run
                    break
        
    pickle.dump(profiles, output,-1)
    output.close()


1


*******Your IP got tracked... Wait for sometime..*******


>>>>>>>>>>>>>>>>>>>>> 0
>>>>>>>>>>>>>>>>>>>>>

Save the Updated Crawl State and Load the Newly Scraped Profiles


In [53]:
pkl_fl = open("linkedin_UserIds.pickle","wb")
pickle.dump(userIds,pkl_fl,-1) # errors out here
pkl_fl.close()

pk_fl = open("linkedin_Black_Listed_UserIds.pickle","wb")
pickle.dump(black_listed_userids,pk_fl,-1) # errors out here
pk_fl.close()

pkl_file = open("linkedin_profiles_temp.pickle","rb")
temp_list=pickle.load(pkl_file) # errors out here
pkl_file.close()

In [54]:
len(temp_list)


Out[54]:
0

In [44]:
profile_list = my_original_list + temp_list

In [45]:
len(profile_list)


Out[45]:
2670

In [46]:
import os
# Remove non-.jpg files (the placeholder .png images) from the Images directory
for file in os.listdir("Images"):
    file_path = os.path.join("Images", file)
    try:
        if not file.endswith('.jpg'):
            os.unlink(file_path)
        #elif os.path.isdir(file_path): shutil.rmtree(file_path)
    except Exception as e:
        print(e)

In [47]:
pkl_file = open("linkedin_profiles.pickle","wb")
pickle.dump(profile_list, pkl_file,-1) # errors out here
pkl_file.close()