In [1]:
import MySQLdb
In [10]:
f = open('Reviews_Data/are_you_scammers.txt', 'w')
In [11]:
def Jaccard (user_one, user_two, tablename, cursor):
intersect = 0
user_one = '"' + user_one + '"'
user_two = '"' + user_two + '"'
#get product ids of products a user reviewed
#note when/if i remake data tables, this is how to get rid of duplicates:
cursor.execute("SELECT distinct PID FROM "+tablename+" WHERE RUserID = " + user_one + " GROUP BY RText;")
prods_one = cursor.fetchall()
cursor.execute("SELECT distinct PID FROM "+ tablename+" WHERE RUserID = " + user_two + " Group BY RText;")
prods_two = cursor.fetchall()
#print prods_two
#check all their products reviewed, pairwise, to see if they reviewed the same stuff.
if min(len(prods_one), len(prods_two))>5: #need to have both reviewed at least 5 distinct things
for i in range(len(prods_one)):
for j in range(len(prods_two)):
if prods_one[i] == prods_two[j]:
intersect += 1
#print intersect
#print len(prods_one) + len(prods_two)
return float(intersect)/float((len(prods_one) + len(prods_two)))
In [12]:
db = MySQLdb.connect(host="localhost", user="root", db = "home_kitchen")
In [13]:
tablename = 'all_hk'
In [14]:
cursor = db.cursor()
In [15]:
cursor.execute("SELECT DISTINCT PTitle, PID, COUNT(PTitle) AS CNT FROM "+tablename+" GROUP BY PTitle HAVING CNT< 100")
Out[15]:
In [16]:
prod_list = cursor.fetchall()
prod_list = tuple(x[0] for x in prod_list)
prod_list = [x.replace('"', "'") for x in prod_list]
max_jac = 0
In [17]:
for k in range(len(prod_list)):
#if k % 20 == 0:
#print "I'm doing something! k is " + str(k)
cursor.execute("SELECT RuserID FROM "+tablename+" WHERE PTitle = " +'"'+ prod_list[k] +'"'+ ";")
user_list = cursor.fetchall()
user_list = tuple(x[0] for x in user_list)
#print "product : ", prod_list[k], "users: ", user_list, "\n"
if 1<len(user_list)<25: #just to make the computation not super-long.
for i in range(len(user_list)):
for j in range(i+1, len(user_list)):
if(user_list[i]!=user_list[j]):
jac = Jaccard(user_list[i], user_list[j], tablename, cursor)
#print "Jaccard of ", user_list[i], user_list[j], " is: ", jac, " and max_jac is currently ", max_jac
if jac > min(max_jac, .1):
outputstring = "Jaccard of " + str(user_list[i]) +" " + str(user_list[j]) + " is: " + str(jac)+ " and product is " + str(prod_list[k])+ "\n"
f.write(outputstring)
cursor.execute("SELECT distinct PID, Ptitle FROM "+tablename+" WHERE RUserID = " + '"' + str(user_list[i]) + '"' + " Group BY RText;")
prods_one = cursor.fetchall()
cursor.execute("SELECT distinct PID, PTitle FROM "+tablename+" WHERE RUserID = " + '"' + str(user_list[j]) + '"' + " Group BY RText;")
prods_two = cursor.fetchall()
prodstring = "products : "+ str(prods_one)+ " \n " + str(prods_two)+ "\n"
if jac > max_jac:
max_jac = jac
print "Biggest jaccard similarity so far is: ", jac, " on users ", user_list[i], " ", user_list[j], "\n"
print prodstring
In [122]:
db.close()
In [ ]: