In [1]:
from bs4 import BeautifulSoup
import urllib2
import json
import pandas as pd
import re
import pythonwhois
from time import sleep

In [2]:
# Stub that is prefixed to every candidate word (e.g. 'farm' + 'kind').
name_stub = 'farm'
# TLDs to check via whois. NOTE: keep this list in sync with the TLD
# columns of the results DataFrame created below — the availability loop
# writes one DataFrame column per entry here, so a TLD missing from the
# DataFrame columns raises a KeyError. This list matches the columns
# (and the recorded run) of the rest of the notebook.
domain_list = ['.com', '.org', '.net', '.de', '.co', '.io']

In [4]:
# Fetch candidate words from the wordfind service and build the name list.
contains_letters = ''  # letters the words must contain ('' = no constraint)
no_letters = '4'       # required word length
url = 'http://itools.subhashbose.com/wordfind/containing/'+contains_letters+'/no-of-letters_equal-to_'+no_letters
try:
    content = urllib2.urlopen(url).read()
except urllib2.URLError as e:
    # Re-raise after logging: the original code fell through to
    # BeautifulSoup(content) with `content` undefined, which turned a
    # network failure into a confusing NameError.
    print('SoupError', e)
    raise
soup = BeautifulSoup(content)

# The word list is rendered inside the element with id="result".
results = soup.find(id='result')
word_list = [text for text in results.stripped_strings]

# Delete first entry in list (presumably a page heading, not a word — verify)
word_list.pop(0)

# Add additional hand-picked words
word_list.insert(0, u'robot')
word_list.insert(0, u'kind')

# Prefix every word with the name stub ('kind' -> 'farmkind', ...)
word_list = [name_stub+word for word in word_list]

# Alternative candidate lists (toggle by uncommenting)
#german_words = ['datageist', 'datagarden', 'delicadata', 'datastrudel', 'databahn', 'datakraut', 'datawunder', 'dataphil', 'datahunt', 'databeat', 'dataoo']
#word_list = word_list + german_words
#word_list = ['datalook']

In [37]:
# Create the results DataFrame: one row per candidate name (upper-cased),
# one column per check. TLD columns are derived from domain_list so the
# availability loop below can never hit a missing column (the original
# hard-coded column list could drift out of sync with domain_list and
# raise a KeyError in the loop).
df = pd.DataFrame(index=[x.upper() for x in word_list],
                  columns=['US-Trademark'] + domain_list + ['Twitter', 'Google'])

# Add sentence to get a feeling for the name
df['Example'] = df.index + ' is a site for sharing data-driven projects for social good.'
df.head()


Out[37]:
US-Trademark .com .org .net .de .co .io Twitter Google Example
DATALOOK NaN NaN NaN NaN NaN NaN NaN NaN NaN DATALOOK is a site for sharing data-driven pro...

In [38]:
#https://api.twitter.com/1.1/users/lookup.json?screen_name='+name
#word_list2 = word_list[75:]
# For each candidate name, check Twitter handle availability, US trademark
# count, whois status per TLD, and the Google result count, writing the
# results into the corresponding df row. 'X' marks "available"; blank
# means taken; trademark cells may hold a hit count instead.
for word in word_list:

    print(word.upper())

    # Twitter: undocumented endpoint returning JSON with a 'reason' field;
    # 'available' means the handle is free.
    twitter_url = 'https://twitter.com/users/username_available?username='+word
    try:
        twitter = json.loads(urllib2.urlopen(twitter_url).read())
        # Use .loc (row, column) — chained indexing df[col][row] = ...
        # assigns into a possible copy and may silently not write.
        if twitter['reason'] == 'available':
            df.loc[word.upper(), 'Twitter'] = 'X'
        else:
            df.loc[word.upper(), 'Twitter'] = ''
    except urllib2.URLError as e:
        print('TwitterError', e)

    # US-Trademark via markerapi.
    # FIXME(security): API credentials are hardcoded in this URL; move
    # them out of the notebook (e.g. environment variables) before sharing.
    tm_url = 'http://www.markerapi.com/api/v1/trademark/search/'+word+'/username/dataforgood/password/Cmk6P2ZQXN'
    try:
        trademark = json.loads(urllib2.urlopen(tm_url).read())
        if trademark['count'] == 0:
            df.loc[word.upper(), 'US-Trademark'] = 'X'
        else:
            df.loc[word.upper(), 'US-Trademark'] = str(trademark['count'])
    except urllib2.URLError as e:
        print('TrademarkError', e)

    # Domains: whois lookup per TLD.
    for tld in domain_list:
        try:
            domain_check = pythonwhois.get_whois(word+tld)
        except Exception:
            import traceback
            print(traceback.format_exc())
            # Skip this TLD on failure. The original fell through and
            # reused `domain_check` from the previous iteration (stale
            # result), or crashed with a NameError on the first failure.
            sleep(5)
            continue
        print(tld)
        # No admin contact in the whois record -> treat the domain as free.
        if domain_check['contacts']['admin'] is None:
            df.loc[word.upper(), tld] = 'X'
        else:
            df.loc[word.upper(), tld] = ''
        # Time delay necessary for whois server - otherwise blocked
        sleep(5)

    # Google results: scrape the result count from the search page.
    url = 'http://www.google.com/search?hl=en&q='+word
    hdr = {'User-Agent': 'Mozilla/5.0'}
    content = None
    try:
        req = urllib2.Request(url, headers=hdr)
        content = urllib2.urlopen(req).read()
    except urllib2.URLError as e:
        print('SoupError', e)
    if content is None:
        # Request failed: leave the Google column empty for this word
        # instead of parsing a stale page from the previous iteration.
        continue

    soup = BeautifulSoup(content)
    #no_results = re.findall(r'About (.+?) results', soup.get_text())
    stats = soup.find(id='resultStats')
    if stats is None:
        # Google sometimes serves a blocked/captcha page with no stats
        # element; the original crashed with AttributeError here.
        print('GoogleError: no resultStats element for ' + word)
        continue
    no_results = str(stats.get_text()).replace(',', '')
    df.loc[word.upper(), 'Google'] = int(re.findall(r'\d+', no_results)[0])
    print(re.findall(r'\d+', no_results)[0])


DATALOOK
.com
.org
.net
.de
.co
.io
395000

In [39]:
# Show the first 20 candidate names with their availability results
# (iloc[:20] is equivalent to head(20)).
df.iloc[:20]


Out[39]:
US-Trademark .com .org .net .de .co .io Twitter Google Example
DATALOOK X X X X X 395000 DATALOOK is a site for sharing data-driven pro...

In [40]:
# Optional: uncomment the three lines below to export the results table
# to an Excel file.
#writer = pd.ExcelWriter('Trademark_Domain_Script_company_names.xlsx')
#df.to_excel(writer)
#writer.save()