In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json
import pickle
import datetime
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

In [314]:
binary = FirefoxBinary(r'/usr/bin/firefox')
caps = DesiredCapabilities.FIREFOX.copy()
#Set ‘marionette’ browser to True
caps['marionette'] = True
#Launch the Firefox instance by specifying the geckodriver executable path
#driver = webdriver.Firefox(firefox_binary=binary,capabilities=caps)
#driver.wait = WebDriverWait(driver, 5)

In [242]:
def formatOriginalNameToWikiName(originalname):
    """
    return goodname if we return a better format from wiki
    if not we just return empty string
    we need this function,since if not formated yet, most of the time when you search wiki with the bad 
    name in the url it will return nothing.
    """
    wikipedia.set_lang("en")
    allWikiResults=wikipedia.search(originalname)
    if(len(allWikiResults)==0):
        return ""
    else:
        return str(wikipedia.search(originalname)[0])

In [317]:
#inorder to be parallel we need to pass in a new driver
def MakeSeleniumToSearchWithOriginalName(originalname,driver):
    wikiname=formatOriginalNameToWikiName(originalname)
    nametosearch=originalname if(wikiname=="") else wikiname
    driver.get("https://en.wikipedia.org/wiki/"+str(nametosearch))
    result={}
    result["findresult"]={}
    result["nofind"]={}
    try:
        elem = driver.find_element_by_css_selector(".interwiki-ar a")
        """
        these two lines of code needs to run before elem.click(), since it will goto
        another page never find it any more.
        """
        tempdic={}
        #print("me"+str(elem.get_attribute("href")))
        tempdic["arurl"]=str(elem.get_attribute("href"))
        elem.click()
        tempdic["originalname"]=originalname
        tempdic["wikiname"]=wikiname
        firstheading=driver.find_element_by_id("firstHeading")
        #arabic is from left to right that why u need to get the first one that returns.
        tempdic["arname"]=firstheading.text.split("\n")[0]
        #print(tempdic["arname"])
        result["findresult"]=tempdic
    except Exception as e:
        #print(e)
        tempno={}
        tempno["originalname"]=originalname
        result["nofind"]=tempno
        pass
    return result

In [285]:
def clean_line(line):
    # Take out extra space, underscores, comments, etc.
    cleaned = re.sub("_* .+", "", line).strip()
    cleaned = re.sub("_$", "", cleaned, flags=re.MULTILINE)
    return cleaned

def ingest_dictionary(dict_path):
    """
    Read in the country (or other) actor dictionaries.
    """
    with open(dict_path) as f:
        country_file = f.read()
    split_file = country_file.split("\n")
    
    dict_dict = []
    key_name = ""
    alt_names = [] 
    roles = []

    for line in split_file:
        if not line:
            pass
        elif line[0] == "#":
            pass
        elif re.match("[A-Z]", line[0]):
            # handle the previous
            entry = {"actor_en" : key_name,
                    "alt_names_en" : alt_names,
                    "roles" : roles}
            dict_dict.append(entry)
            # zero everything out
            alt_names = []
            roles = []
            # make new key name
            key_name = clean_line(line)
            # check to see if the role is built in
            if bool(re.search("\[[A-Z]{3}\]", line)):
                roles = re.findall("\[(.+?)\]", line)
        elif line[0] == "+":
            cleaned = clean_line(line[1:])
            alt_names.append(cleaned)
        elif re.match("\s", line):
            roles.append(line.strip())
    return dict_dict 
dp = "./Phoenix.Countries.actors.txt"
dict_dict = ingest_dictionary(dp)

In [233]:
len(dict_dict)


Out[233]:
18390

In [315]:
def buildMultiLanguageActorDictionary(dict_dict):
    finalResult={}
    finalResult["goodones"]=[]
    finalResult["badones"]=[]
    driver = webdriver.Firefox(firefox_binary=binary,capabilities=caps)
    driver.wait = WebDriverWait(driver, 5)
    for item in dict_dict:
        originalname=item["actor_en"]
        if(originalname!=""):
            temp=MakeSeleniumToSearchWithOriginalName(originalname,driver)
            if(temp["findresult"]):
                finalResult["goodones"].append(temp["findresult"])
            else:
                finalResult["badones"].append(temp["nofind"])
    return finalResult

In [310]:
len(dict_dict)


Out[310]:
18390

In [318]:
from multiprocessing import Process

def func1():
      print('driver 1 start')
      rst1=buildMultiLanguageActorDictionary(dict_dict[0:40])
      wholeresults.append(rst1)
      print('driver 1 finish')

def func2():
      print('driver 2 start')
      rst2=buildMultiLanguageActorDictionary(dict_dict[40:80])
      wholeresults.append(rst2)
      print('driver 2 finish')
def func3():
      print('driver 3 start')
      rst3=buildMultiLanguageActorDictionary(dict_dict[80:120])
      wholeresults.append(rst3)
      print('driver 3 finish')
def func4():
      print('driver 4 start')
      rst4=buildMultiLanguageActorDictionary(dict_dict[120:160])#len(dict_dict)])
      wholeresults.append(rst4)
      print('driver 4 finish')
        

if __name__ == '__main__':
      wholeresults=[]
      p1 = Process(target=func1)
      p1.start()
      p2 = Process(target=func2)
      p2.start()
      p3= Process(target=func3)
      p3.start()
      p4 = Process(target=func4)
      p4.start()
      
      p1.join()
      p2.join()
      p3.join()
      p4.join()


driver 1 start
driver 2 start
driver 3 start
driver 4 start
Process Process-7:
Process Process-9:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 141, in _new_conn
    (self.host, self.port), self.timeout, **extra_kw)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/util/connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/usr/lib/python3.5/socket.py", line 732, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 141, in _new_conn
    (self.host, self.port), self.timeout, **extra_kw)
socket.gaierror: [Errno -3] Temporary failure in name resolution
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/util/connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/usr/lib/python3.5/socket.py", line 732, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):

During handling of the above exception, another exception occurred:

socket.gaierror: [Errno -3] Temporary failure in name resolution
Traceback (most recent call last):

During handling of the above exception, another exception occurred:

  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 600, in urlopen
    chunked=chunked)
Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 356, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "/usr/lib/python3.5/http/client.py", line 1106, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.5/http/client.py", line 1151, in _send_request
    self.endheaders(body)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 356, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "/usr/lib/python3.5/http/client.py", line 1102, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python3.5/http/client.py", line 1106, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
    self.send(msg)
  File "/usr/lib/python3.5/http/client.py", line 1151, in _send_request
    self.endheaders(body)
  File "/usr/lib/python3.5/http/client.py", line 877, in send
    self.connect()
  File "/usr/lib/python3.5/http/client.py", line 1102, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
    self.send(msg)
  File "/usr/lib/python3.5/http/client.py", line 877, in send
    self.connect()
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 166, in connect
    conn = self._new_conn()
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 166, in connect
    conn = self._new_conn()
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 150, in _new_conn
    self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7ff21fac1898>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/adapters.py", line 440, in send
    timeout=timeout
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 649, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/util/retry.py", line 388, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?action=query&srlimit=10&limit=10&srprop=&format=json&list=search&srsearch=GIOVANNI_SPADOLINI (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff21fac1898>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 150, in _new_conn
    self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7ff21fab1278>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution

During handling of the above exception, another exception occurred:


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/adapters.py", line 440, in send
    timeout=timeout
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 649, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "<ipython-input-318-da492f6c002b>", line 16, in func3
    rst3=buildMultiLanguageActorDictionary(dict_dict[8000:12000])
  File "<ipython-input-315-5b46402ef819>", line 10, in buildMultiLanguageActorDictionary
    temp=MakeSeleniumToSearchWithOriginalName(originalname,driver)
  File "<ipython-input-317-df5ad56cea67>", line 3, in MakeSeleniumToSearchWithOriginalName
    wikiname=formatOriginalNameToWikiName(originalname)
  File "<ipython-input-242-916760ec7eb8>", line 9, in formatOriginalNameToWikiName
    allWikiResults=wikipedia.search(originalname)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/util.py", line 28, in __call__
    ret = self._cache[key] = self.fn(*args, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/wikipedia.py", line 103, in search
    raw_results = _wiki_request(search_params)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/wikipedia.py", line 737, in _wiki_request
    r = requests.get(API_URL, params=params, headers=headers)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/api.py", line 72, in get
    return request('get', url, params=params, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/util/retry.py", line 388, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/api.py", line 58, in request
    return session.request(method=method, url=url, **kwargs)
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?action=query&srlimit=10&limit=10&srprop=&format=json&list=search&srsearch=LIAMINE_Z%1AROUAL (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff21fab1278>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/sessions.py", line 502, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-318-da492f6c002b>", line 5, in func1
    rst1=buildMultiLanguageActorDictionary(dict_dict[0:4000])
  File "<ipython-input-315-5b46402ef819>", line 10, in buildMultiLanguageActorDictionary
    temp=MakeSeleniumToSearchWithOriginalName(originalname,driver)
  File "<ipython-input-317-df5ad56cea67>", line 3, in MakeSeleniumToSearchWithOriginalName
    wikiname=formatOriginalNameToWikiName(originalname)
  File "<ipython-input-242-916760ec7eb8>", line 9, in formatOriginalNameToWikiName
    allWikiResults=wikipedia.search(originalname)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/util.py", line 28, in __call__
    ret = self._cache[key] = self.fn(*args, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/wikipedia.py", line 103, in search
    raw_results = _wiki_request(search_params)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/sessions.py", line 612, in send
    r = adapter.send(request, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/wikipedia.py", line 737, in _wiki_request
    r = requests.get(API_URL, params=params, headers=headers)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/api.py", line 72, in get
    return request('get', url, params=params, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/adapters.py", line 504, in send
    raise ConnectionError(e, request=request)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/api.py", line 58, in request
    return session.request(method=method, url=url, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/sessions.py", line 502, in request
    resp = self.send(prep, **send_kwargs)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?action=query&srlimit=10&limit=10&srprop=&format=json&list=search&srsearch=GIOVANNI_SPADOLINI (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff21fac1898>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/sessions.py", line 612, in send
    r = adapter.send(request, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/adapters.py", line 504, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?action=query&srlimit=10&limit=10&srprop=&format=json&list=search&srsearch=LIAMINE_Z%1AROUAL (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff21fab1278>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))
Process Process-10:
Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 141, in _new_conn
    (self.host, self.port), self.timeout, **extra_kw)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/util/connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/usr/lib/python3.5/socket.py", line 732, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno -3] Temporary failure in name resolution

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 356, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "/usr/lib/python3.5/http/client.py", line 1106, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.5/http/client.py", line 1151, in _send_request
    self.endheaders(body)
  File "/usr/lib/python3.5/http/client.py", line 1102, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
    self.send(msg)
  File "/usr/lib/python3.5/http/client.py", line 877, in send
    self.connect()
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 166, in connect
    conn = self._new_conn()
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 150, in _new_conn
    self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7ff21fac11d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/adapters.py", line 440, in send
    timeout=timeout
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 649, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/util/retry.py", line 388, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?action=query&srlimit=10&limit=10&srprop=&format=json&list=search&srsearch=DAVID_OSBORNE_HAY (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff21fac11d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-318-da492f6c002b>", line 21, in func4
    rst4=buildMultiLanguageActorDictionary(dict_dict[12000:len(dict_dict)])
  File "<ipython-input-315-5b46402ef819>", line 10, in buildMultiLanguageActorDictionary
    temp=MakeSeleniumToSearchWithOriginalName(originalname,driver)
  File "<ipython-input-317-df5ad56cea67>", line 3, in MakeSeleniumToSearchWithOriginalName
    wikiname=formatOriginalNameToWikiName(originalname)
  File "<ipython-input-242-916760ec7eb8>", line 9, in formatOriginalNameToWikiName
    allWikiResults=wikipedia.search(originalname)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/util.py", line 28, in __call__
    ret = self._cache[key] = self.fn(*args, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/wikipedia.py", line 103, in search
    raw_results = _wiki_request(search_params)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/wikipedia.py", line 737, in _wiki_request
    r = requests.get(API_URL, params=params, headers=headers)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/api.py", line 72, in get
    return request('get', url, params=params, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/api.py", line 58, in request
    return session.request(method=method, url=url, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/sessions.py", line 502, in request
    resp = self.send(prep, **send_kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/sessions.py", line 612, in send
    r = adapter.send(request, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/adapters.py", line 504, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?action=query&srlimit=10&limit=10&srprop=&format=json&list=search&srsearch=DAVID_OSBORNE_HAY (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff21fac11d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))
Process Process-8:
Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 141, in _new_conn
    (self.host, self.port), self.timeout, **extra_kw)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/util/connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/usr/lib/python3.5/socket.py", line 732, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno -3] Temporary failure in name resolution

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 356, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "/usr/lib/python3.5/http/client.py", line 1106, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.5/http/client.py", line 1151, in _send_request
    self.endheaders(body)
  File "/usr/lib/python3.5/http/client.py", line 1102, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
    self.send(msg)
  File "/usr/lib/python3.5/http/client.py", line 877, in send
    self.connect()
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 166, in connect
    conn = self._new_conn()
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connection.py", line 150, in _new_conn
    self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7ff21f9c9e48>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/adapters.py", line 440, in send
    timeout=timeout
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/connectionpool.py", line 649, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/urllib3/util/retry.py", line 388, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?action=query&srlimit=10&limit=10&srprop=&format=json&list=search&srsearch=GUILLERMO_RODRIGUEZ_LARA (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff21f9c9e48>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-318-da492f6c002b>", line 11, in func2
    rst2=buildMultiLanguageActorDictionary(dict_dict[4000:8000])
  File "<ipython-input-315-5b46402ef819>", line 10, in buildMultiLanguageActorDictionary
    temp=MakeSeleniumToSearchWithOriginalName(originalname,driver)
  File "<ipython-input-317-df5ad56cea67>", line 3, in MakeSeleniumToSearchWithOriginalName
    wikiname=formatOriginalNameToWikiName(originalname)
  File "<ipython-input-242-916760ec7eb8>", line 9, in formatOriginalNameToWikiName
    allWikiResults=wikipedia.search(originalname)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/util.py", line 28, in __call__
    ret = self._cache[key] = self.fn(*args, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/wikipedia.py", line 103, in search
    raw_results = _wiki_request(search_params)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/wikipedia/wikipedia.py", line 737, in _wiki_request
    r = requests.get(API_URL, params=params, headers=headers)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/api.py", line 72, in get
    return request('get', url, params=params, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/api.py", line 58, in request
    return session.request(method=method, url=url, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/sessions.py", line 502, in request
    resp = self.send(prep, **send_kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/sessions.py", line 612, in send
    r = adapter.send(request, **kwargs)
  File "/home/yan/python_vir/env/lib/python3.5/site-packages/requests/adapters.py", line 504, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?action=query&srlimit=10&limit=10&srprop=&format=json&list=search&srsearch=GUILLERMO_RODRIGUEZ_LARA (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff21f9c9e48>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))

In [210]:
test=len(wikipedia.search("obama"))

In [212]:
wikipedia.search("obama")[0]


Out[212]:
'Barack Obama'

In [198]:
MakeSeleniumToSearchWithWikiFormattedName("Mohammad Najibullah")

In [221]:
driver.get("https://en.wikipedia.org/wiki/"+"Mohammad Najibullah")

In [227]:
elem = driver.find_element_by_css_selector(".interwiki-ar a")

In [228]:
elem.get_attribute("href")


Out[228]:
'https://ar.wikipedia.org/wiki/%D9%85%D8%AD%D9%85%D8%AF_%D9%86%D8%AC%D9%8A%D8%A8_%D8%A7%D9%84%D9%84%D9%87'

In [191]:
test=driver.find_element_by_id("firstHeading")
#arabic is from left to right that why u need to get the first one that returns.
test.text.split("\n")[0]

In [192]:
#arabic is from left to right that why u need to get the first one that returns.
test.text.split("\n")[0]


Out[192]:
'محمد نجيب الله'

In [105]:
from googleapiclient.discovery import build
import pprint

my_api_key = "AIzaSyBBulleVoiDN9i8NITQqH_BUNGgyWX-nmA"
my_cse_id = "003461024781403571159:p4qrcenq1l0"

def google_search(search_term, api_key, cse_id, **kwargs):
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    #return res['spelling']['correctedQuery']
#     return res['item']
    return res['items']
results = google_search(
    'SIBGHATULLAH_MOJADEDI', my_api_key, my_cse_id, num=1)
for result in results:
     pprint.pprint(result['formattedUrl'])
#print(results)


'https://en.wikipedia.org/wiki/Sibghatullah_Mojaddedi'

In [110]:
print("\u0623\u0644\u0628\u0631\u062a \u0623\u064a\u0646\u0634\u062a\u0627\u064a\u0646")


ألبرت أينشتاين

In [117]:
import requests

payload = {'action': 'query', 'titles': 'Alert Einstein','prop':'langlinks','format':'json'}

r = requests.get("https://en.wikipedia.org/w/api.php", data=payload)
soup=BeautifulSoup(r.content,"lxml")

In [126]:
import wikipedia
print(wikipedia.summary("Wikipedia"))


Wikipedia ( or  WIK-i-PEE-dee-ə) is a free online encyclopedia with the aim to allow anyone to edit articles. Wikipedia is the largest and most popular general reference work on the Internet and is ranked among the ten most popular websites. Wikipedia is owned by the nonprofit Wikimedia Foundation.
Wikipedia was launched on January 15, 2001, by Jimmy Wales and Larry Sanger. Sanger coined its name, a portmanteau of wiki and encyclopedia. There was only the English language version initially, but it quickly developed similar versions in other languages, which differ in content and in editing practices. With 5,433,361 articles, the English Wikipedia is the largest of the more than 290 Wikipedia encyclopedias. Overall, Wikipedia consists of more than 40 million articles in more than 250 different languages and, as of February 2014, it had 18 billion page views and nearly 500 million unique visitors each month.
As of March 2017, Wikipedia has about forty thousand high-quality articles known as Featured Articles and Good Articles that cover vital topics. In 2005, Nature published a peer review comparing 42 science articles from Encyclopædia Britannica and Wikipedia, and found that Wikipedia's level of accuracy approached Encyclopædia Britannica's.
Wikipedia has been criticized for allegedly exhibiting systemic bias, presenting a mixture of "truths, half truths, and some falsehoods", and, in controversial topics, being subject to manipulation and spin.

In [127]:
wikipedia.search("Barack")


Out[127]:
['Barack Obama',
 'Barack Obama in comics',
 'Barack Obama Sr.',
 'Barack Obama: Der schwarze Kennedy',
 'List of things named after Barack Obama',
 'Inauguration of Barack Obama',
 'Bibliography of Barack Obama',
 'Barack Obama Presidential Center',
 'Timeline of the presidency of Barack Obama',
 'Barack Obama religion conspiracy theories']

In [186]:
wikipedia.set_lang("en")
test=wikipedia.page("SIBGHATULLAH_MOJADEDI").html()
soup=BeautifulSoup(test,'lxml')
hi=soup.find("li",{"class":"interwiki-ar"})

In [155]:
wikipedia.page("SIBGHATULLAH_MOJADEDI").references


Out[155]:
['http://www.worldcat.org/identities/containsVIAFID/75918762',
 'http://www.worldcat.org/oclc/123336516',
 'http://www.worldcat.org/oclc/237144347',
 'http://aviation-safety.net/database/record.php?id=19920529-0',
 'http://hrw.org/reports/2005/afghanistan0605/4.htm#_Toc105552342',
 'http://id.loc.gov/authorities/names/no97021045',
 'http://www.afghan-bios.info/index.php?option=com_afghanbios&id=1085&task=view&total=2314&start=1266&Itemid=2',
 'http://www.aftabir.com/news/article/view/2016/02/09/1139108',
 'http://www.bbc.com/pashto/afghanistan/2016/02/160215_hh-27th-anniv-soviet-forces-defeat-afg',
 'http://www.khaama.com/mojadedi-announces-the-establishment-of-a-new-political-council-9607',
 'http://www.mojaddedi.org/biography-of-sibghatullah-al-mojaddedi.html',
 'http://www.pts.af/',
 'http://www.rferl.org/content/article/1066619.html',
 'http://www.washingtontimes.com/news/2010/sep/28/afghan-peace-council-draws-fire/',
 'http://www.zmong-afghanistan.com/profiles/sibghatullah.asp',
 'https://archive.org/stream/azu_acku_risalah_ds371_2_meem46_yaa1375#page/n1/mode/1up',
 'https://books.google.com.my/books?id=1xyh_DBV1bMC&pg=PA492&lpg=PA492&dq=sibghatullah+mujaddidi+born&source=bl&ots=0-bbq_LRo5&sig=evfzzrgRMTkeWS13W4QhfaHJwe4&hl=en&sa=X&redir_esc=y#v=onepage&q=sibghatullah%20mujaddidi%20born&f=false',
 'https://books.google.com/books?id=RUSNyMH1aFQC&lpg=PR4&pg=PA406#v=onepage&q&f=false',
 'https://books.google.com/books?id=_zWhhy8L0uQC&lpg=PP1&pg=PT15#v=onepage&q&f=false',
 'https://viaf.org/viaf/75918762',
 'https://web.archive.org/web/20110606152711/http://www.zmong-afghanistan.com/profiles/sibghatullah.asp']

In [156]:
ny = wikipedia.page("New York")
#ny.title
#ny.url
#ny.links[0]
#wikipedia.set_lang("en")
wikipedia.summary("SIBGHATULLAH_MOJADEDI", sentences=2)


Out[156]:
"Sibghatullah Mojaddedi (Pashto: صبغت الله مجددی\u200e\u200e, born 21 April 1925) is a politician in Afghanistan, who served as Acting President after the fall of Mohammad Najibullah's government in April 1992. He is also the founder of the Afghan National Liberation Front, and served as the chairman of the 2003 loya jirga that approved Afghanistan's new constitution."

In [187]:
wikipedia.search("MOHAMMAD_NAJIBULLAH")


Out[187]:
['Mohammad Najibullah',
 'Mohammad Najatuallah Siddiqui',
 'Abdul Razzaq (Taliban governor)',
 'Vice President of Afghanistan',
 'Abdul Wahed Sorabi',
 'Habibia High School',
 'Ghazi High School',
 'Najib',
 'Najibullah Torwayana',
 'National Reconciliation']

In [ ]: