In [8]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
# load the list of URLs to scrape (a comma-separated file with a 'URLS' column)
df_urls = pd.read_csv('urls_1.csv')
#df_test = df_urls.iloc[0:10, :]
#df_test = df_urls.loc[[536,115], :]
df_test = df_urls.head(n=2)

df_test
def getEmail(x):
    page = requests.get(str(x))
    soup = BeautifulSoup(page.content, 'html.parser')
    emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", soup.text, re.I))
    #mailtos = soup.select('a[href^=mailto]')
    #return soup
    EmailSearch = [emails,soup]
    return EmailSearch
    
    
for index, row in df_test.iterrows():
    x = row['URLS']
    EmailSearch2 = getEmail(x)
    emails = EmailSearch2[0]
    soup = EmailSearch2[1]
    print (emails)
    #if emails==set():
        #print("EMAIL EQUALS SET")
        #for elem in soup(text=re.compile(r'Contact')):
            #parent = (elem.parent)
            #y = parent.get('href')
            #print(y)
            #EmailSearch3 = getEmail(y)
            #emails = EmailSearch3[0]
            
    #df_test.set_value(index, 'EMAIL', emails)
    #df_test.set_value(index, 'MAILTOS', mailtos)

df_test
    



# NOTES / TODO from earlier attempts:
#   - This seems to crash when letting more than a few rows into the df
#     (see the connection-reset traceback below).
#   - How to write the email list values into multiple new columns next to the URL?
#   - If a row ends up with no email, mark it explicitly (e.g. 'NO EMAIL').
#   - Only follow 'a' tags whose text matches "contact", "about us", etc.,
#     pass that href back into the email scraper, and repeat the analysis.
#     http://stackoverflow.com/a/14470573
#   - Single-page sanity check:
#     page = requests.get('https://northstarfund.org/')
#     soup = BeautifulSoup(page.content, 'html.parser')
#     print(soup)
    
    
#df_test
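
# One possible answer to the "multiple new columns next to the URL" question in
# the notes above: spread the set of addresses into numbered columns. Only a
# sketch -- the helper name and the EMAIL_n column prefix are invented here,
# not part of the original data.
def emails_to_columns(df, index, emails, prefix='EMAIL'):
    """Write each address from `emails` into its own column on row `index`."""
    if not emails:
        df.loc[index, prefix + '_1'] = 'NO EMAIL'
        return
    for i, address in enumerate(sorted(emails), start=1):
        df.loc[index, '{}_{}'.format(prefix, i)] = address

# usage inside the loop above would be: emails_to_columns(df_test, index, emails)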


{'duspapply@mit.edu', 'duspinfo@mit.edu'}
---------------------------------------------------------------------------
ConnectionResetError                      Traceback (most recent call last)
/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)
    593                                                   body=body, headers=headers,
--> 594                                                   chunked=chunked)
    595 

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    390                     # otherwise it looks like a programming error was the cause.
--> 391                     six.raise_from(e, None)
    392         except (SocketTimeout, BaseSSLError, SocketError) as e:

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/packages/six.py in raise_from(value, from_value)

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    386                 try:
--> 387                     httplib_response = conn.getresponse()
    388                 except Exception as e:

/Users/zachpostone/anaconda/lib/python3.5/http/client.py in getresponse(self)
   1173             try:
-> 1174                 response.begin()
   1175             except ConnectionError:

/Users/zachpostone/anaconda/lib/python3.5/http/client.py in begin(self)
    281         while True:
--> 282             version, status, reason = self._read_status()
    283             if status != CONTINUE:

/Users/zachpostone/anaconda/lib/python3.5/http/client.py in _read_status(self)
    242     def _read_status(self):
--> 243         line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    244         if len(line) > _MAXLINE:

/Users/zachpostone/anaconda/lib/python3.5/socket.py in readinto(self, b)
    574             try:
--> 575                 return self._sock.recv_into(b)
    576             except timeout:

ConnectionResetError: [Errno 54] Connection reset by peer

During handling of the above exception, another exception occurred:

ProtocolError                             Traceback (most recent call last)
/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    422                     retries=self.max_retries,
--> 423                     timeout=timeout
    424                 )

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)
    642             retries = retries.increment(method, url, error=e, _pool=self,
--> 643                                         _stacktrace=sys.exc_info()[2])
    644             retries.sleep()

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    333             if read is False:
--> 334                 raise six.reraise(type(error), error, _stacktrace)
    335             elif read is not None:

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/packages/six.py in reraise(tp, value, tb)
    684         if value.__traceback__ is not tb:
--> 685             raise value.with_traceback(tb)
    686         raise value

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)
    593                                                   body=body, headers=headers,
--> 594                                                   chunked=chunked)
    595 

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    390                     # otherwise it looks like a programming error was the cause.
--> 391                     six.raise_from(e, None)
    392         except (SocketTimeout, BaseSSLError, SocketError) as e:

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/packages/six.py in raise_from(value, from_value)

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    386                 try:
--> 387                     httplib_response = conn.getresponse()
    388                 except Exception as e:

/Users/zachpostone/anaconda/lib/python3.5/http/client.py in getresponse(self)
   1173             try:
-> 1174                 response.begin()
   1175             except ConnectionError:

/Users/zachpostone/anaconda/lib/python3.5/http/client.py in begin(self)
    281         while True:
--> 282             version, status, reason = self._read_status()
    283             if status != CONTINUE:

/Users/zachpostone/anaconda/lib/python3.5/http/client.py in _read_status(self)
    242     def _read_status(self):
--> 243         line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    244         if len(line) > _MAXLINE:

/Users/zachpostone/anaconda/lib/python3.5/socket.py in readinto(self, b)
    574             try:
--> 575                 return self._sock.recv_into(b)
    576             except timeout:

ProtocolError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

During handling of the above exception, another exception occurred:

ConnectionError                           Traceback (most recent call last)
<ipython-input-8-73005a3c3282> in <module>()
     22 for index, row in df_test.iterrows():
     23     x = row['URLS']
---> 24     EmailSearch2 = getEmail(x)
     25     emails = EmailSearch2[0]
     26     soup = EmailSearch2[1]

<ipython-input-8-73005a3c3282> in getEmail(x)
     11 df_test
     12 def getEmail(x):
---> 13     page = requests.get(str(x))
     14     soup = BeautifulSoup(page.content, 'html.parser')
     15     emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", soup.text, re.I))

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/api.py in get(url, params, **kwargs)
     68 
     69     kwargs.setdefault('allow_redirects', True)
---> 70     return request('get', url, params=params, **kwargs)
     71 
     72 

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/api.py in request(method, url, **kwargs)
     54     # cases, and look like a memory leak in others.
     55     with sessions.Session() as session:
---> 56         return session.request(method=method, url=url, **kwargs)
     57 
     58 

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    486         }
    487         send_kwargs.update(settings)
--> 488         resp = self.send(prep, **send_kwargs)
    489 
    490         return resp

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/sessions.py in send(self, request, **kwargs)
    607 
    608         # Send the request
--> 609         r = adapter.send(request, **kwargs)
    610 
    611         # Total elapsed time of the request (approximately)

/Users/zachpostone/anaconda/lib/python3.5/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    471 
    472         except (ProtocolError, socket.error) as err:
--> 473             raise ConnectionError(err, request=request)
    474 
    475         except MaxRetryError as e:

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
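
One way to keep the run from aborting when a host resets the connection (as in the traceback above, and the "crashes when letting more than a few rows in" note) is to give the request a timeout and catch requests' exceptions, returning an empty result instead of raising. A minimal sketch; get_email_safe is an invented name, not from the notebook:

import re
import requests
from bs4 import BeautifulSoup

EMAIL_RE = re.compile(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", re.I)

def get_email_safe(url, timeout=10):
    """Like getEmail(), but returns (set(), None) instead of raising on bad hosts."""
    try:
        page = requests.get(str(url), timeout=timeout)
        page.raise_for_status()
    except requests.exceptions.RequestException:
        return set(), None
    soup = BeautifulSoup(page.content, 'html.parser')
    return set(EMAIL_RE.findall(soup.text)), soup

With this, the row loop can record an empty set for unreachable sites and keep going instead of stopping at the second URL.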

In [ ]:
'''
# WORKING
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os, json
from lxml import html  # needed for html.fromstring() below

df_urls = pd.read_csv('urls_1.csv')

#for row in df_urls:
#make this iterate through the rows in df_urls and append email addresses in an adjacent column. If multiple email addresses on page, add multiple columns?

page = requests.get('http://dusp.mit.edu/department/contact-directions')
tree = html.fromstring(page.text)
soup = BeautifulSoup(page.content, 'html.parser')

#mailtos = soup.select('a[href^=mailto]')

# a set of crawled emails
emails = set()
new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", soup.text, re.I))
#how to include mailto links (with non-email text) as another thing to capture?
#print (mailtos)
#print (other)

print (new_emails)

#soup.body.findAll(text=re.compile('^@$'))
#for elem in soup(text=re.compile(r'@')):
    #print (elem.parent)
    
#http://stackoverflow.com/a/866050    
    
#soup = BeautifulSoup(page.content, 'html.parser')
#para = soup.find_all('p')
#print (para)
#print(soup.prettify())

#append the results from emails to new columns


#print (tree)
#print (buyers)
#for i in tree:
    #emails = re.search("a", i)
    #print (emails)

#http://stackoverflow.com/a/3655588

#http://scraping.pro/simple-email-crawler-python/
'''
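
# The questions above ask how to also capture mailto: links (whose link text is
# not an address) and how to follow only "contact"/"about us" style anchors.
# A rough sketch; the function name and keyword pattern are illustrative, not
# from the original notebook:
import re
import requests
from bs4 import BeautifulSoup

def find_mailtos_and_contact_links(url):
    """Return (addresses from mailto: hrefs, hrefs whose anchor text looks like a contact page)."""
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    mailtos = {a['href'][len('mailto:'):].split('?')[0]
               for a in soup.select('a[href^="mailto:"]')}
    contact_re = re.compile(r'contact|about', re.I)
    contact_links = [a['href'] for a in soup.find_all('a', href=True)
                     if contact_re.search(a.get_text())]
    return mailtos, contact_links

# The hrefs in contact_links can then be fed back into the same email scraper.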

In [2]:
from bs4 import BeautifulSoup
import requests
import requests.exceptions
import urllib
import urllib.parse
from collections import deque
import re
import sys
import pandas as pd

df_urls = pd.DataFrame()
df_urls = pd.read_table('urls_5.csv', sep=',')
df_test = df_urls.head(n=3)


for index, row in df_test.iterrows():
    inputurl = row['Links']
    new_urls = deque([inputurl])
    # a set of urls that we have already crawled
    processed_urls = set()
    # a set of crawled emails
    emails = set()
    # process urls one by one until we exhaust the queue
    while len(new_urls):
        # move next url from the queue to the set of processed urls
        url = new_urls.popleft()
        processed_urls.add(url)
        # extract base url to resolve relative links
        parts = urllib.parse.urlsplit(url)
        base_url = "{0.scheme}://{0.netloc}".format(parts)
        path = url[:url.rfind('/')+1] if '/' in parts.path else url

        # path is the page's base path (everything up to the last '/'); not used further below

        # get url's content
        #print("Processing %s" % url)
        try:
            response = requests.get(url)
        except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
            # ignore pages with errors
            continue

        # extract all email addresses and add them into the resulting set
        new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
        emails.update(new_emails)

        # create a beautiful soup for the html document
        soup = BeautifulSoup(response.text, 'html.parser')

        # find and process all the anchors in the document
        for anchor in soup.find_all("a"):
            # extract link url from the anchor
            link = anchor.attrs["href"] if "href" in anchor.attrs else ''
            # resolve relative links
            if link.startswith('mailto:'):
                # record the address itself; set.update() on a string would add single characters
                emails.add(link[len('mailto:'):])
                continue
            elif link.startswith('/'):
                link = base_url + link
            elif not link.startswith(inputurl):
                # skip external links rather than abandoning the remaining anchors on the page
                continue
            # add the new url to the queue if it was not enqueued nor processed yet
            if not link in new_urls and not link in processed_urls:
                new_urls.append(link)
    # once the queue for this row is exhausted, store everything that was found
    # (.at sets a single cell by label; set_value is deprecated in newer pandas)
    print(emails)
    df_test.at[index, 'Email'] = ', '.join(sorted(emails))
    #df_test.at[index, 'MAILTOS'] = mailtos
    #print(new_urls)

df_test

#print new_emails
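
# The base_url + link concatenation above only covers hrefs that start with '/'.
# urllib.parse.urljoin also handles the other relative forms ('page.html',
# '../x', '//host/path', '#fragment'), so that step could be sketched as:
import urllib.parse

def resolve_link(current_url, href):
    """Resolve href against the page it was found on; '' for a missing href."""
    if not href:
        return ''
    return urllib.parse.urljoin(current_url, href)

# e.g. resolve_link('http://example.org/a/b.html', 'contact.html')
#      -> 'http://example.org/a/contact.html'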


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-2-8be05621a35b> in <module>()
     10 
     11 df_urls = pd.DataFrame()
---> 12 df_urls = pd.read_table('urls_5.csv', sep=',')
     13 df_test = df_urls.head(n=3)
     14 

/Users/zachpostone/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    560                     skip_blank_lines=skip_blank_lines)
    561 
--> 562         return _read(filepath_or_buffer, kwds)
    563 
    564     parser_f.__name__ = name

/Users/zachpostone/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    323         return parser
    324 
--> 325     return parser.read()
    326 
    327 _parser_defaults = {

/Users/zachpostone/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
    813                 raise ValueError('skip_footer not supported for iteration')
    814 
--> 815         ret = self._engine.read(nrows)
    816 
    817         if self.options.get('as_recarray'):

/Users/zachpostone/anaconda/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
   1312     def read(self, nrows=None):
   1313         try:
-> 1314             data = self._reader.read(nrows)
   1315         except StopIteration:
   1316             if self._first_chunk:

pandas/parser.pyx in pandas.parser.TextReader.read (pandas/parser.c:8748)()

pandas/parser.pyx in pandas.parser.TextReader._read_low_memory (pandas/parser.c:9003)()

pandas/parser.pyx in pandas.parser.TextReader._read_rows (pandas/parser.c:10022)()

pandas/parser.pyx in pandas.parser.TextReader._convert_column_data (pandas/parser.c:11397)()

pandas/parser.pyx in pandas.parser.TextReader._convert_tokens (pandas/parser.c:12302)()

pandas/parser.pyx in pandas.parser.TextReader._convert_with_dtype (pandas/parser.c:13740)()

pandas/parser.pyx in pandas.parser.TextReader._string_convert (pandas/parser.c:13983)()

pandas/parser.pyx in pandas.parser._string_box_utf8 (pandas/parser.c:19298)()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 25: invalid continuation byte
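
The failure above happens while reading the CSV, before any scraping starts: byte 0xe4 is not valid UTF-8 (it is 'ä' in Latin-1/Windows-1252), so urls_5.csv was probably saved in one of those encodings. A small sketch of a more forgiving loader; read_urls is an invented helper and the encoding list is a guess, not something the notebook confirms:

import pandas as pd

def read_urls(path):
    """Try a few common encodings; latin-1 maps every byte, so it never raises."""
    for enc in ('utf-8', 'cp1252', 'latin-1'):
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue

df_urls = read_urls('urls_5.csv')

If the file's real encoding is known (for example, it was exported from Excel on Windows), passing it directly as pd.read_csv('urls_5.csv', encoding='cp1252') is simpler.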

In [ ]: