In [12]:
import urllib
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import os,sys
import time
from datetime import date
# Prefer the C-accelerated pickle implementation when it exists (Python 2),
# and fall back to the pure/standard module otherwise.
try:
    import cPickle as pickle
except ImportError:
    # Only a missing module should trigger the fallback; any other error
    # (e.g. KeyboardInterrupt) must not be silently swallowed.
    import pickle
import pprint
from collections import deque
from shutil import copyfile
import random

In [ ]:


In [8]:
# Fetch one LinkedIn profile page with PhantomJS, print the raw HTML, and
# collect every <img> tag found in it. The PhantomJS service log is discarded.
driver = webdriver.PhantomJS(service_log_path=os.path.devnull)
try:
    driver.get("http://www.linkedin.com/in/williamhgates")
    html3 = driver.page_source
finally:
    # Always shut the browser process down, even when the page load fails;
    # otherwise each run of this cell leaves a PhantomJS process behind.
    driver.quit()

# NOTE(review): the recorded output of this cell is a JavaScript login
# redirect with an empty <body>, so `images` was presumably empty on that run.
print(html3)
soup = BeautifulSoup(html3, "lxml")
images = list(soup.find_all('img'))


<html><head>
<script type="text/javascript">
window.onload = function() {
  // Parse the tracking code from cookies.
  var trk = "sentinel_org_block";
  var cookies = document.cookie.split("; ");
  for (var i = 0; i < cookies.length; ++i) {
    if ((cookies[i].indexOf("trkCode=") == 0) && (cookies[i].length > 8)) {
      trk = cookies[i].substring(8);
    }
  }

  // Get the protocol for the redirect url.
  var protocol = "http:";
  if (window.location.protocol == "https:") {
    protocol = "https:";
  } else {
    // If "sl" cookie is set, redirect to https.
    for (var i = 0; i < cookies.length; ++i) {
      if ((cookies[i].indexOf("sl=") == 0) && (cookies[i].length > 3)) {
        window.location.href = "https:" + window.location.href.substring(window.location.protocol.length);
        return;
      }
    }
  }

  // Get the new domain. For touch.www.linkedin.com or tablet.www.linkedin.com
  // we strip "touch." and "tablet.". For international domains such as
  // fr.linkedin.com, we convert it to www.linkedin.com
  var domain = location.host;
  if (domain.substr(0, 6) == "touch.") {
    domain = domain.substr(6);
  } else if (domain.substr(0, 7) == "tablet.") {
    domain = domain.substr(7);
  } else if (domain.charAt(2) == ".") {
    domain = "www" + domain.substr(2);
  }
  
  window.location.href = "https://" + domain + "/uas/login?trk=" + trk + "&session_redirect=" +
      encodeURIComponent(protocol + "//" + domain +
      window.location.href.substr(window.location.href.search(window.location.host) +
                                  window.location.host.length));
}
</script>
</head><body>
</body></html>

In [ ]:


In [13]:
# Load the stored profile records. Later cells iterate over the result and
# index each record by 'Gender' and 'Full_Name'.
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted run of this notebook.
# NOTE(review): the original line carried the remark "errors out here"; the
# failure it refers to is not recorded in the notebook.
with open("linkedin_profiles.pickle", "rb") as pkl_fl:
    # The context manager closes the file even if unpickling raises.
    my_original_list1 = pickle.load(pkl_fl)

In [14]:
# Load the contents of linkedin_known_profiles.pickle into my_known_list.
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted run of this notebook.
# NOTE(review): my_known_list is not read by any other visible cell, and the
# original line carried the remark "errors out here" — confirm both.
with open("linkedin_known_profiles.pickle", "rb") as pkl_fl:
    # The context manager closes the file even if unpickling raises.
    my_known_list = pickle.load(pkl_fl)

In [17]:
# Tally the profiles whose 'Gender' field is still 'Unknown'.
# new_profile_list is reset here but deliberately left empty by this cell.
new_profile_list = []
cnt = 0
for prof in my_original_list1:
    if prof['Gender'] != 'Unknown':
        continue  # gender already resolved; not part of the tally
    cnt = cnt + 1

print(cnt)


2

In [16]:
import json

import urllib2

# For every profile whose gender is 'Unknown', ask gender-api.com to guess a
# gender from the first word of 'Full_Name'. The profile dict is updated in
# place and also collected in newList.

cnt = 0  # not used by this cell; retained so the notebook namespace is unchanged

# The API key is a credential and must not live in the notebook source.
# Export GENDER_API_KEY in the environment that starts the kernel.
myKey = os.environ.get("GENDER_API_KEY")
if not myKey:
    raise RuntimeError("Set the GENDER_API_KEY environment variable before running this cell")

newList = []

for prof in my_original_list1:
    if prof['Gender'] != 'Unknown':
        continue

    name_parts = prof['Full_Name'].split()
    if name_parts:
        first_name = name_parts[0]
        # urlencode needs byte strings on Python 2; encode unicode names so
        # non-ASCII characters are percent-escaped instead of raising.
        if isinstance(first_name, unicode):
            first_name = first_name.encode('utf-8')
        query = urllib.urlencode([('key', myKey), ('name', first_name)])

        response = urllib2.urlopen("https://gender-api.com/get?" + query)
        try:
            data = json.load(response)
        finally:
            # Release the HTTP connection even when the body is not valid JSON.
            response.close()

        prof['Gender'] = data['gender'].title()
    # A profile with an empty name cannot be looked up; it keeps 'Unknown'
    # but is still collected, just like a profile the API could not classify.
    newList.append(prof)

In [18]:
# Keep only the profiles whose 'Gender' is something other than 'Unknown',
# then report how many were kept.
new_profile_list = [
    prof for prof in my_original_list1
    if prof['Gender'] != 'Unknown'
]
cnt = len(new_profile_list)

print(cnt)


2606

In [19]:
# Display the number of entries currently held in new_profile_list.
len(new_profile_list)


Out[19]:
2606

In [20]:
# Persist new_profile_list to linkedin_profiles.pickle, replacing whatever the
# file held before (this is the same file name the notebook loads
# my_original_list1 from).
with open("linkedin_profiles.pickle", "wb") as pkl_file:
    # HIGHEST_PROTOCOL is what the original's -1 selected; the context manager
    # guarantees the file is flushed and closed even if dumping raises.
    pickle.dump(new_profile_list, pkl_file, pickle.HIGHEST_PROTOCOL)

In [21]:
# Persist newList to linkedin_known_profiles.pickle.
# NOTE(review): this replaces the file that my_known_list was loaded from, and
# no visible cell merges my_known_list into newList — confirm that dropping
# the previously stored entries is intended.
with open("linkedin_known_profiles.pickle", "wb") as pkl_file:
    # HIGHEST_PROTOCOL is what the original's -1 selected; the context manager
    # guarantees the file is flushed and closed even if dumping raises.
    pickle.dump(newList, pkl_file, pickle.HIGHEST_PROTOCOL)

In [ ]: