In [1]:
import MySQLdb

In [10]:
f = open('Reviews_Data/are_you_scammers.txt', 'w')

In [11]:
def Jaccard (user_one, user_two, tablename, cursor):
    intersect = 0
    user_one = '"' + user_one + '"'
    user_two = '"' + user_two + '"'
    
    #get product ids of products a user reviewed
    #note when/if i remake data tables, this is how to get rid of duplicates:
    cursor.execute("SELECT distinct PID FROM "+tablename+" WHERE RUserID = " + user_one  + " GROUP BY RText;")
    prods_one = cursor.fetchall()
    
    
    cursor.execute("SELECT distinct PID FROM "+ tablename+" WHERE RUserID = " + user_two + " Group BY RText;")
    prods_two = cursor.fetchall()
    #print prods_two
    
    #check all their products reviewed, pairwise, to see if they reviewed the same stuff. 
    if min(len(prods_one), len(prods_two))>5:  #need to have both reviewed at least 5 distinct things
        for i in range(len(prods_one)):
            for j in range(len(prods_two)):
                if prods_one[i] == prods_two[j]:
                    intersect += 1
            
    #print intersect
    #print len(prods_one) + len(prods_two)
    return float(intersect)/float((len(prods_one) + len(prods_two)))

In [12]:
db = MySQLdb.connect(host="localhost", user="root", db = "home_kitchen")

In [13]:
tablename = 'all_hk'

In [14]:
cursor = db.cursor()

In [15]:
cursor.execute("SELECT DISTINCT PTitle, PID, COUNT(PTitle) AS CNT FROM "+tablename+" GROUP BY PTitle HAVING CNT< 100")


Out[15]:
71807L

In [16]:
prod_list = cursor.fetchall()
prod_list = tuple(x[0] for x in prod_list)
prod_list = [x.replace('"', "'") for x in prod_list]
max_jac = 0

In [17]:
for k in range(len(prod_list)):
    #if k % 20 == 0:
        #print "I'm doing something! k is " + str(k)
    cursor.execute("SELECT RuserID FROM "+tablename+" WHERE PTitle = " +'"'+ prod_list[k] +'"'+ ";")
    user_list = cursor.fetchall()
    user_list = tuple(x[0] for x in user_list)
    #print "product : ", prod_list[k], "users: ", user_list, "\n"
    if 1<len(user_list)<25: #just to make the computation not super-long. 
        for i in range(len(user_list)):
            for j in range(i+1, len(user_list)):
                if(user_list[i]!=user_list[j]):
                    jac = Jaccard(user_list[i], user_list[j], tablename, cursor)
                    #print "Jaccard of ", user_list[i], user_list[j], " is: ", jac, " and max_jac is currently ", max_jac
                    if jac > min(max_jac, .1):
                        outputstring = "Jaccard of " + str(user_list[i]) +" " +  str(user_list[j]) + " is: " + str(jac)+ " and product is " + str(prod_list[k])+ "\n"    
                        f.write(outputstring)
                        cursor.execute("SELECT distinct PID, Ptitle FROM "+tablename+" WHERE RUserID = " + '"' + str(user_list[i]) + '"'  + " Group BY RText;")
                        prods_one = cursor.fetchall()
                        cursor.execute("SELECT distinct PID, PTitle FROM "+tablename+" WHERE RUserID = " + '"' + str(user_list[j]) + '"'  + " Group BY RText;")
                        prods_two = cursor.fetchall()
                        prodstring =  "products : "+ str(prods_one)+ " \n " + str(prods_two)+ "\n"
                        if jac > max_jac:
                            max_jac = jac
                            print "Biggest jaccard similarity so far is: ", jac, " on users ", user_list[i], " ", user_list[j], "\n"
                            print prodstring


Exception _mysql_exceptions.ProgrammingError: (2014, "Commands out of sync; you can't run this command now") in <bound method Cursor.__del__ of <MySQLdb.cursors.Cursor object at 0x100efe310>> ignored
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-17-06fa730f6d75> in <module>()
     10             for j in range(i+1, len(user_list)):
     11                 if(user_list[i]!=user_list[j]):
---> 12                     jac = Jaccard(user_list[i], user_list[j], tablename, cursor)
     13                     #print "Jaccard of ", user_list[i], user_list[j], " is: ", jac, " and max_jac is currently ", max_jac
     14                     if jac > min(max_jac, .1):

<ipython-input-11-abfea1033b09> in Jaccard(user_one, user_two, tablename, cursor)
      6     #get product ids of products a user reviewed
      7     #note when/if i remake data tables, this is how to get rid of duplicates:
----> 8     cursor.execute("SELECT distinct PID FROM "+tablename+" WHERE RUserID = " + user_one  + " GROUP BY RText;")
      9     prods_one = cursor.fetchall()
     10 

/Users/kelty/Desktop/Amazon-Review-Project/venv/lib/python2.7/site-packages/MySQLdb/cursors.pyc in execute(self, query, args)
    188         try:
    189             r = None
--> 190             r = self._query(query)
    191         except TypeError, m:
    192             if m.args[0] in ("not enough arguments for format string",

/Users/kelty/Desktop/Amazon-Review-Project/venv/lib/python2.7/site-packages/MySQLdb/cursors.pyc in _query(self, q)
    352 
    353     def _query(self, q):
--> 354         rowcount = self._do_query(q)
    355         self._post_get_result()
    356         return rowcount

/Users/kelty/Desktop/Amazon-Review-Project/venv/lib/python2.7/site-packages/MySQLdb/cursors.pyc in _do_query(self, q)
    316         db = self._get_db()
    317         self._last_executed = q
--> 318         db.query(q)
    319         self._do_get_result()
    320         return self.rowcount

KeyboardInterrupt: 

In [122]:
db.close()

In [ ]: