In [68]:
import json
import os

import pandas
import requests
from bs4 import BeautifulSoup

In [92]:
def read_file(file_name, encoding='utf-8'):
    """
    Parse a training file made of ``#key:value`` lines into per-person records.

    An ``#id`` line starts a record; the ``#name``, ``#org`` and
    ``#search_results_page`` lines that follow fill it in. The remaining
    fields (``homepage``, ``pic``, ``email``, ``gender``, ``position``,
    ``location``) are created as empty strings. Lines with any other key, or
    without a colon, are ignored, as are field lines that appear before the
    first ``#id``.

    :param file_name: path of the text file to read
    :param encoding: text encoding used to decode the file
    :return: dict mapping each id to its record dict
    """
    parsed_fields = ('name', 'org', 'search_results_page')
    train_dict = {}
    current_id = None
    with open(file_name, encoding=encoding) as f:
        for line in f:
            # Split on the FIRST colon only: values such as URLs
            # ("http://host:8081/...") or organisations may contain colons.
            key, sep, value = line.partition(':')
            if not sep or not key.startswith('#'):
                continue
            field = key[1:]
            value = value.strip()
            if field == 'id':
                # Compare stripped values so a repeated #id line does not
                # wipe the record that is currently being filled in.
                if value != current_id:
                    current_id = value
                    train_dict[current_id] = {'name':'', 'org':'', 'search_results_page':'',
                                              'homepage':'', 'pic':'', 'email':'', 'gender':'',
                                              'position':'', 'location':''}
            elif field in parsed_fields and current_id is not None:
                train_dict[current_id][field] = value

    return train_dict

In [93]:
# Parse the sample training file into a dict of records keyed by id.
result = read_file('small_training.txt')

In [94]:
# Show the parsed records as a sanity check.
result


Out[94]:
{'5616d8a645cedb3397b889d7': {'email': '',
  'gender': '',
  'homepage': '',
  'location': '',
  'name': 'Chen Zhang',
  'org': 'Peking University(Peking University),Beijing,China',
  'pic': '',
  'position': '',
  'search_results_page': 'http://ifang.ml:8081/5616d8a645cedb3397b889d7.html'}}

In [115]:
def scrap_from_google(words, file_name, timeout=10):
    """
    Cache the first page of Google search results for ``words`` in a file.

    The request goes to https://www.google.com.hk/search with ``words`` sent
    as the ``q`` query parameter, and the response text is written to
    ``results/<file_name>``. The ``results`` directory is created if needed.

    :param words: search keywords, separated by spaces
    :param file_name: name of the cache file created under ``results/``
    :param timeout: seconds to wait for the server before giving up
    :return: None
    :raises ValueError: if ``file_name`` is empty
    :raises requests.HTTPError: if the server answers with an error status
    """
    if not file_name:
        raise ValueError('file_name must not be empty')
    # Let requests build the query string so that spaces, '&', '#', '+' and
    # non-ASCII characters in the keywords are percent-encoded instead of
    # silently changing the meaning of the URL.
    response = requests.get('https://www.google.com.hk/search',
                            params={'q': words}, timeout=timeout)
    # Do not cache an error page as if it were a results page.
    response.raise_for_status()
    os.makedirs('results', exist_ok=True)
    with open('results/'+file_name, 'w') as f:
        f.write(response.text)

In [116]:
# Fetch and cache one Google results page per record; the cache file is named
# after the last path segment of the record's search_results_page URL.
for key, record in result.items():
    file_name = record['search_results_page'].rsplit('/', 1)[-1]
    words = ' '.join((record['name'], record['org']))
    scrap_from_google(words, file_name)

In [139]:
def extract_url(file_name):
    """
    Print and collect every link found in a cached Google results page.

    Reads ``results/<file_name>``, locates the ``<div id="ires">`` element and
    walks all ``<a>`` tags inside it that carry an ``href`` attribute. Each
    href is printed followed by a ``-----`` separator line.

    :param file_name: name of the cached page under ``results/``
    :return: list of href strings in document order; empty when the page
             has no ``<div id="ires">`` element
    """
    with open('results/'+file_name) as f:
        result_page = BeautifulSoup(f.read(), 'lxml')
    results = result_page.find('div',{'id':'ires'})
    if results is None:
        # find() returns None when the container is absent; there is
        # nothing to extract from such a page.
        return []
    # href=True skips anchors without an href, which would otherwise
    # show up as None.
    urls = [anchor['href'] for anchor in results.find_all('a', href=True)]
    for url in urls:
        print(url)
        print('-----')
    return urls

In [140]:
# List the links of every cached results page; the cache file is named after
# the last path segment of the record's search_results_page URL.
for key, record in result.items():
    file_name = record['search_results_page'].rsplit('/', 1)[-1]
    extract_url(file_name)


/url?q=http://mgv.pku.edu.cn/%3Fcatalog%3Denpiintro%26pname%3DChen_Zhang&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQFggTMAA&usg=AFQjCNE7gGwLNpzPUCQ7nlZu52z8Twzc1g
-----
/url?q=http://webcache.googleusercontent.com/search%3Fq%3Dcache:8y6bZdlEK_cJ:http://mgv.pku.edu.cn/%3Fcatalog%253Denpiintro%2526pname%253DChen_Zhang%252BChen%2BZhang%2BPeking%2BUniversity(Peking%2BUniversity),Beijing,China%26newwindow%3D1%26hl%3Dzh-TW%26ct%3Dclnk&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQIAgWMAA&usg=AFQjCNFER_skpHEsZNpOkMnGoGCqmeC8WQ
-----
/search?newwindow=1&ie=UTF-8&q=related:mgv.pku.edu.cn/%3Fcatalog%3Denpiintro%26pname%3DChen_Zhang+Chen+Zhang+Peking+University(Peking+University),Beijing,China&tbo=1&sa=X&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQHwgXMAA
-----
/url?q=https://www.researchgate.net/profile/Chen_Zhang135&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQFggZMAE&usg=AFQjCNEke6qWGbWhIMvhNlgQNrmEEJa7Ww
-----
/url?q=https://www.researchgate.net/profile/Chen_Zhang82&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQFggbMAI&usg=AFQjCNFBtMmT4aDlq75VBc7jxlAxDdlBeQ
-----
/url?q=https://www.researchgate.net/profile/Qing_Chen9&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQFggdMAM&usg=AFQjCNFQxOvpQNmQBbl6vzf-mYofiY6meQ
-----
/search?newwindow=1&ie=UTF-8&q=related:https://www.researchgate.net/profile/Qing_Chen9+Chen+Zhang+Peking+University(Peking+University),Beijing,China&tbo=1&sa=X&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQHwggMAM
-----
/url?q=https://www.facebook.com/public/Chen-Zhang/school/Peking-University-103771729661827/&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQFggiMAQ&usg=AFQjCNGb6PwZVjG5FTWtE8JyjNpE4n9WKA
-----
/url?q=http://webcache.googleusercontent.com/search%3Fq%3Dcache:qf-l6EjuT5UJ:https://www.facebook.com/public/Chen-Zhang/school/Peking-University-103771729661827/%252BChen%2BZhang%2BPeking%2BUniversity(Peking%2BUniversity),Beijing,China%26newwindow%3D1%26hl%3Dzh-TW%26ct%3Dclnk&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQIAglMAQ&usg=AFQjCNEiSdPpaFTMoqyS50ocEzLSJdI_wQ
-----
/url?q=https://www.linkedin.com/in/en00007128&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQFggnMAU&usg=AFQjCNH73Qu1UsQfn4eBznzkjGnJU17VLQ
-----
/search?newwindow=1&ie=UTF-8&q=related:https://www.linkedin.com/in/en00007128+Chen+Zhang+Peking+University(Peking+University),Beijing,China&tbo=1&sa=X&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQHwgqMAU
-----
/url?q=http://www.cms.zju.edu.cn/conference/YCMC/PRIZES.html&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQFggsMAY&usg=AFQjCNGmJmQUBcW9P-qRJZHolpRNq3Ke0g
-----
/url?q=http://webcache.googleusercontent.com/search%3Fq%3Dcache:DlVK8vY75xwJ:http://www.cms.zju.edu.cn/conference/YCMC/PRIZES.html%252BChen%2BZhang%2BPeking%2BUniversity(Peking%2BUniversity),Beijing,China%26newwindow%3D1%26hl%3Dzh-TW%26ct%3Dclnk&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQIAgvMAY&usg=AFQjCNERHDytJq9iZDgZzS7kxlMP8ge-Ww
-----
/search?newwindow=1&ie=UTF-8&q=related:www.cms.zju.edu.cn/conference/YCMC/PRIZES.html+Chen+Zhang+Peking+University(Peking+University),Beijing,China&tbo=1&sa=X&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQHwgwMAY
-----
/url?q=https://en.wikipedia.org/wiki/Peking_University&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQFggyMAc&usg=AFQjCNFXS_FzK2CKtqYFJozJ8GdcqCQHFA
-----
/url?q=http://webcache.googleusercontent.com/search%3Fq%3Dcache:OKz-ybZdVSoJ:https://en.wikipedia.org/wiki/Peking_University%252BChen%2BZhang%2BPeking%2BUniversity(Peking%2BUniversity),Beijing,China%26newwindow%3D1%26hl%3Dzh-TW%26ct%3Dclnk&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQIAg1MAc&usg=AFQjCNFgoCo--IuvFNxbazkTJSCk5WWfhg
-----
/search?newwindow=1&ie=UTF-8&q=related:https://en.wikipedia.org/wiki/Peking_University+Chen+Zhang+Peking+University(Peking+University),Beijing,China&tbo=1&sa=X&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQHwg2MAc
-----
/url?q=https://zfin.org/ZDB-LAB-110923-2&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQFgg4MAg&usg=AFQjCNE82q4VOMIomw5KzybQZE_4w04VIg
-----
/url?q=http://webcache.googleusercontent.com/search%3Fq%3Dcache:iF1NBITQu3gJ:https://zfin.org/ZDB-LAB-110923-2%252BChen%2BZhang%2BPeking%2BUniversity(Peking%2BUniversity),Beijing,China%26newwindow%3D1%26hl%3Dzh-TW%26ct%3Dclnk&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQIAg7MAg&usg=AFQjCNGR52BFvP0yheSqZwRvx5W4Tf6S3A
-----
/search?newwindow=1&ie=UTF-8&q=related:https://zfin.org/ZDB-LAB-110923-2+Chen+Zhang+Peking+University(Peking+University),Beijing,China&tbo=1&sa=X&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQHwg8MAg
-----
/url?q=http://pkuasc.fasic.org.au/scholars/&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQFgg-MAk&usg=AFQjCNHVXBDyciOBvfFpFFqxQ48IhOVJFA
-----
/url?q=http://webcache.googleusercontent.com/search%3Fq%3Dcache:dz5PP7qKTkkJ:http://pkuasc.fasic.org.au/scholars/%252BChen%2BZhang%2BPeking%2BUniversity(Peking%2BUniversity),Beijing,China%26newwindow%3D1%26hl%3Dzh-TW%26ct%3Dclnk&sa=U&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQIAhBMAk&usg=AFQjCNFadhFSFmcvJEzzBde32qwTSHyERw
-----
/search?newwindow=1&ie=UTF-8&q=related:pkuasc.fasic.org.au/scholars/+Chen+Zhang+Peking+University(Peking+University),Beijing,China&tbo=1&sa=X&ved=0ahUKEwi6sMyoxoPWAhUBy7wKHUwqC8cQHwhCMAk
-----

In [ ]: