In [1]:
# Imports. The notebook was written for Python 2 (urllib2, print statements);
# urllib2 no longer exists in Python 3, so shim the three names the notebook
# actually uses (urlopen, Request, URLError) to keep the cells runnable on
# both interpreters without touching any other cell.
from bs4 import BeautifulSoup
try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: expose the urllib2 API surface used below.
    import urllib.request
    import urllib.error

    class urllib2(object):
        urlopen = staticmethod(urllib.request.urlopen)
        Request = urllib.request.Request
        URLError = urllib.error.URLError
import json
import pandas as pd
import re
import pythonwhois
from time import sleep
In [2]:
# Name stem that every scraped word is prefixed with ('farm' + 'kind' -> 'farmkind').
name_stub = 'farm'
# TLDs checked in the whois loop. These MUST match the domain columns of the
# results DataFrame ('.com', '.org', '.net', '.de', '.co', '.io'): the original
# list contained '.iq', '.it' and '.boss', which raised KeyError when the loop
# wrote df[tld][...], and omitted '.de', whose column was never filled in.
domain_list = ['.com', '.org', '.net', '.de', '.co', '.io']
In [4]:
# Scrape candidate words from the itools word finder, then build the list of
# candidate names by prefixing each word with name_stub.
contains_letters = ''   # optional substring filter (empty = any letters)
no_letters = '4'        # exact word length requested from the service
url = ('http://itools.subhashbose.com/wordfind/containing/' + contains_letters
       + '/no-of-letters_equal-to_' + no_letters)
# Initialize content so a failed request does not leave it unbound
# (the original raised NameError at the BeautifulSoup call on failure).
content = ''
try:
    content = urllib2.urlopen(url).read()
except urllib2.URLError as e:
    print('SoupError', e)
# Name the parser explicitly so bs4 does not warn / guess one.
soup = BeautifulSoup(content, 'html.parser')
results = soup.find(id='result')
word_list = [text for text in results.stripped_strings]
# The first entry of the result div is a header line, not a word.
word_list.pop(0)
# Hand-picked extra words to consider alongside the scraped ones.
word_list.insert(0, u'robot')
word_list.insert(0, u'kind')
# Prefix every word with the name stem.
word_list = [name_stub + word for word in word_list]
# Alternative hand-curated lists kept from earlier experiments:
#german_words = ['datageist', 'datagarden', 'delicadata', 'datastrudel', 'databahn', 'datakraut', 'datawunder', 'dataphil', 'datahunt', 'databeat', 'dataoo']
#word_list = word_list + german_words
#word_list = ['datalook']
In [37]:
# Create DataFrame
# Availability matrix: one row per candidate name (upper-cased), one column
# per check (US trademark, each domain TLD, Twitter handle, Google hit count).
candidate_names = [word.upper() for word in word_list]
check_columns = ['US-Trademark', '.com', '.org', '.net', '.de', '.co', '.io',
                 'Twitter', 'Google']
df = pd.DataFrame(index=candidate_names, columns=check_columns)
# A sample sentence helps judge how each candidate name reads in context.
df['Example'] = df.index + ' is a site for sharing data-driven projects for social good.'
df.head()
Out[37]:
In [38]:
#https://api.twitter.com/1.1/users/lookup.json?screen_name='+name
#word_list2 = word_list[75:]
# For every candidate name, check: Twitter handle availability, US trademark
# collisions, whois status of each TLD in domain_list, and the number of
# Google results. Each result is written into the matching cell of df.
# All writes use df.loc[row, col] instead of the original chained
# df[col][row] indexing (SettingWithCopy hazard).
for word in word_list:
    row = word.upper()  # df is indexed by the upper-cased name
    print(row)

    # --- Twitter: undocumented username_available endpoint ---
    twitter_url = 'https://twitter.com/users/username_available?username=' + word
    try:
        twitter = json.loads(urllib2.urlopen(twitter_url).read())
        # 'X' marks an available handle, '' a taken one.
        if twitter['reason'] == 'available':
            df.loc[row, 'Twitter'] = 'X'
        else:
            df.loc[row, 'Twitter'] = ''
    except urllib2.URLError as e:
        print('TwitterError', e)

    # --- US trademark search via markerapi ---
    # SECURITY: API username/password are hardcoded in this URL; move them to
    # environment variables (and rotate the key) before sharing this notebook.
    tm_url = ('http://www.markerapi.com/api/v1/trademark/search/' + word
              + '/username/dataforgood/password/Cmk6P2ZQXN')
    try:
        trademark = json.loads(urllib2.urlopen(tm_url).read())
        # 'X' = no registered trademark found; otherwise record the count.
        if trademark['count'] == 0:
            df.loc[row, 'US-Trademark'] = 'X'
        else:
            df.loc[row, 'US-Trademark'] = str(trademark['count'])
    except urllib2.URLError as e:
        print('TrademarkError', e)

    # --- Domain availability via whois ---
    for tld in domain_list:
        # Reset per TLD: the original left domain_check unbound on a
        # first-iteration failure (NameError) and, worse, reused the
        # PREVIOUS TLD's stale record on later failures.
        domain_check = None
        try:
            domain_check = pythonwhois.get_whois(word + tld)
        except Exception:
            import traceback
            print(traceback.format_exc())
        print(tld)
        if domain_check is not None:
            # No admin contact in the whois record -> treat domain as free.
            if domain_check['contacts']['admin'] is None:
                df.loc[row, tld] = 'X'
            else:
                df.loc[row, tld] = ''
        # Time delay necessary for whois server - otherwise blocked
        sleep(5)

    # --- Google result count ---
    url = 'http://www.google.com/search?hl=en&q=' + word
    hdr = {'User-Agent': 'Mozilla/5.0'}
    content = ''  # avoid NameError below if the request fails
    try:
        req = urllib2.Request(url, headers=hdr)
        content = urllib2.urlopen(req).read()
    except urllib2.URLError as e:
        print('SoupError', e)
    soup = BeautifulSoup(content, 'html.parser')
    # Parse "About 1,234,567 results" -> 1234567.
    no_results = str(soup.find(id='resultStats').get_text()).replace(',', '')
    hit_count = re.findall(r'\d+', no_results)[0]
    df.loc[row, 'Google'] = int(hit_count)
    print(hit_count)
In [39]:
# Review the first 20 candidates ('X' marks availability / no collision).
df.head(20)
Out[39]:
In [40]:
# Optional: export the availability matrix to an Excel file (uncomment to run).
#writer = pd.ExcelWriter('Trademark_Domain_Script_company_names.xlsx')
#df.to_excel(writer)
#writer.save()