In [ ]:
from collections import namedtuple
rawEventsRdd = sc.textFile("/home/mert/yahoo/events.txt")
EventDataRow = namedtuple("EventDataRow", ["userId", "itemId", "ts", "latitude", "longitude", "city", "day_of_week", "time_of_day", "event_type"])
def parseRawData(line):
    # fields are tab-separated: userId, itemId, ts, latitude, longitude, city, day_of_week, time_of_day, event_type
    lineSplit = line.split("\t")
    return EventDataRow(userId=lineSplit[0],
                      itemId=lineSplit[1],
                      ts=int(lineSplit[2]),
                      latitude=float(lineSplit[3]),
                      longitude=float(lineSplit[4]),
                      city=lineSplit[5],
                      day_of_week=int(lineSplit[6]),
                      time_of_day=int(lineSplit[7]),
                      event_type=lineSplit[-1],
    )
#eventsRdd = sc.parallelize(rawEventsRdd.map(parseRawData).take(10000000))
eventsRdd = rawEventsRdd.map(parseRawData).cache()
# map the raw string ids to dense integer ids and broadcast the lookup tables to the workers
userIdConversionDictionary = eventsRdd.map(lambda x: x.userId).distinct().zipWithIndex().collectAsMap()
userIdConversionDictionaryBroadcast = sc.broadcast(userIdConversionDictionary)
itemIdConversionDictionary = eventsRdd.map(lambda x: x.itemId).distinct().zipWithIndex().collectAsMap()
itemIdConversionDictionaryBroadcast = sc.broadcast(itemIdConversionDictionary)
cityConversionDictionary = eventsRdd.map(lambda x: x.city).distinct().zipWithIndex().collectAsMap()
cityConversionDictionaryBroadcast = sc.broadcast(cityConversionDictionary)

# rewrite every event using the integer ids
eventsConvertedRdd = eventsRdd.map(lambda x: EventDataRow(
    userId=userIdConversionDictionaryBroadcast.value[x.userId],
    itemId=itemIdConversionDictionaryBroadcast.value[x.itemId],
    ts=x.ts,
    latitude=x.latitude,
    longitude=x.longitude,
    city=cityConversionDictionaryBroadcast.value[x.city],
    day_of_week=x.day_of_week,
    time_of_day=x.time_of_day,
    event_type=x.event_type
    ))
eventsConvertedRdd.take(2)
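
A minimal sketch of the id-conversion pattern used above, run on a toy RDD (the ids below are made up): distinct() drops repeated raw ids, zipWithIndex() pairs each remaining id with a dense integer, and collectAsMap() brings the mapping back to the driver so it can be broadcast.

In [ ]:
# hypothetical toy ids, only to illustrate the mapping
toyIds = sc.parallelize(["u17", "u42", "u17", "u99"])
toyMapping = toyIds.distinct().zipWithIndex().collectAsMap()
toyMappingBroadcast = sc.broadcast(toyMapping)
print(toyMapping)                          # e.g. {'u17': 0, 'u42': 1, 'u99': 2} (order may vary)
print(toyMappingBroadcast.value["u42"])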

In [ ]:
# build (userId, (itemId, ts, latitude, longitude)) pairs for grouping
finalRDD = eventsConvertedRdd.map(lambda x: (
    x.userId,
    (x.itemId, x.ts, x.latitude, x.longitude)
))
finalRDD.take(3)
#groupData = map((lambda (x,y): (x, list(y))), sorted(finalRDD.groupByKey().collect()))
#groupData = map((lambda (x,y): (x, sorted(list(y),key=lambda a: a[1]))), sorted(finalRDD.groupByKey()))
groupData = finalRDD.groupByKey().map(lambda kv: (kv[0], sorted(kv[1], key=lambda a: a[1])))  #sort each user's events by timestamp
#groupData = sc.parallelize(groupData.take(2000))
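
A minimal sketch of the groupByKey-plus-sort step on toy (userId, (itemId, ts, latitude, longitude)) pairs; the values are made up:

In [ ]:
# groupByKey yields (key, iterable-of-values); sorting by the second tuple field
# (the timestamp) puts each user's events in chronological order.
toyPairs = sc.parallelize([
    (0, (5, 1284101485, 40.0, 29.0)),
    (0, (7, 1284101100, 40.0, 29.0)),
    (1, (3, 1284101200, 41.0, 28.0)),
])
toyGrouped = toyPairs.groupByKey().map(lambda kv: (kv[0], sorted(kv[1], key=lambda a: a[1])))
print(toyGrouped.collect())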

In [ ]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r
def detectMovement(x):
    #label each event with a movement context derived from the speed between consecutive events:
    #1 = not available, 2 = standing still, 3 = walking speed, 4 = faster
    data = x[1]
    newData = [(data[0][0], data[0][1], data[0][2], data[0][3], 1)]
    for i in xrange(1,len(data)):
        event = data[i]
        distance = haversine(event[3], event[2], data[i-1][3], data[i-1][2]) * 1000  #in meters
        time_difference = event[1] - newData[i-1][1]  #in seconds
        moving = 1  #not available
        if time_difference <= 300:  #if 2 consecutive events are more than 300 seconds apart, the movement label is not available
            velocity = distance/time_difference if time_difference > 0 else -1
            if velocity < 0:
                moving = 1  #not available
            elif velocity >= 0 and velocity <= 1:
                moving = 2  #standing still
            elif velocity <= 2.4:
                moving = 3  #walking speed
            else:
                moving = 4  #faster
        newData.append((event[0],event[1],event[2],event[3], moving))
    return (x[0], newData)
    #return x
#print haversine(elem[0][1][2][1],elem[0][1][1][1],elem[6][1][2][1],elem[6][1][1][1])
groupData = groupData.map(detectMovement).cache()

#groupData.take(1)
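
A quick sanity check of the haversine helper and the velocity thresholds used in detectMovement (the coordinates are arbitrary):

In [ ]:
# two points roughly 120 m apart along the same latitude; covering that distance
# in 60 s gives about 2 m/s, which falls into the "walking speed" bucket (<= 2.4 m/s)
d_m = haversine(29.0000, 40.0, 29.0014, 40.0) * 1000
velocity = d_m / 60.0
print("%.1f m, %.2f m/s" % (d_m, velocity))   # ~119 m, ~2.0 m/s -> moving = 3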

In [ ]:
import datetime
from math import radians, cos, sin, asin, sqrt
from collections import Counter
def remove_duplicates(values):
    output = []
    seen = set()
    for value in values:
        # If value has not been encountered yet,
        # ... add it to both list and set.
        if value not in seen:
            output.append(value)
            seen.add(value)
    return output
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def convertLocation(line):
    #derive per-event context features for one user: work/home location, time of day, and
    #recently-used / first-used / frequently-used app flags; the movement label is carried through
    listGroup = line[1]
    #events between 06:00 and 18:00 are candidates for the work location
    workGroup = [x for x in listGroup if datetime.datetime.fromtimestamp(int(x[1])).hour >= 6 and
            datetime.datetime.fromtimestamp(int(x[1])).hour <= 18]
    #for each daytime event, count how many daytime events fall within 100 m of it
    numNearLocation = []
    i = 0
    for x in workGroup:
        numNearLocation.append(0)
        for y in workGroup:
            if haversine(x[3], x[2], y[3], y[2]) < 0.1:
                numNearLocation[i] = numNearLocation[i] + 1
        i = i + 1
    #the densest daytime point is assumed to be the work location
    if len(numNearLocation) > 0:
        index_work = numNearLocation.index(max(numNearLocation))
    else:
        index_work = -1
#     workGroup = [(x[0],x[1],x[2],x[3],1) if haversine(x[3], x[2], workGroup[index_work][3], workGroup[index_work][2]) < 0.1 
#                  else (x[0],x[1],x[2],x[3],0) for x in workGroup]
    #workGroup3 = [(x[0],x[1],0)  for x in workGroup if haversine(x[3], x[2], workGroup[index][3], workGroup[index][2]) >= 0.1]        
    
    #events before 06:00 or after 18:00 are candidates for the home location
    homeGroup = [x for x in listGroup if datetime.datetime.fromtimestamp(int(x[1])).hour < 6 or
            datetime.datetime.fromtimestamp(int(x[1])).hour > 18]

    #for each night-time event, count how many night-time events fall within 100 m of it
    numNearLocation = []
    i = 0
    for x in homeGroup:
        numNearLocation.append(0)
        for y in homeGroup:
            if haversine(x[3], x[2], y[3], y[2]) < 0.1:
                numNearLocation[i] = numNearLocation[i] + 1
        i = i + 1
    #the densest night-time point is assumed to be the home location
    if len(numNearLocation) > 0:
        index_home = numNearLocation.index(max(numNearLocation))
    else:
        index_home = -1
#     homeGroup = [(x[0],x[1],x[2],x[3],2) if haversine(x[3], x[2], homeGroup[index_home][3], homeGroup[index_home][2]) < 0.1 
#                  else (x[0],x[1],x[2],x[3],0) for x in homeGroup]
    
    #location context: 1 = within 10 m of the work point, 2 = within 10 m of the home point, 3 = elsewhere
    #each event becomes (app, ts, movement, location)
    if index_home != -1 and index_work != -1:
        listGroup = [(x[0],x[1],x[4],1) if haversine(x[3], x[2], workGroup[index_work][3], workGroup[index_work][2]) < 0.01
                 else(
                    (x[0],x[1],x[4],2) if haversine(x[3], x[2], homeGroup[index_home][3], homeGroup[index_home][2]) < 0.01
                    else (x[0],x[1],x[4],3)
                    )
                 for x in listGroup]
    else:
        listGroup = [(x[0],x[1],x[4],3)
                 for x in listGroup]
    
    
    #time-of-day context: 1 = 06:00-13:59, 2 = 14:00-18:59, 3 = otherwise
    #each event becomes (app, movement, location, time_of_day); the timestamp is dropped here
    listGroup = [(x[0],x[2],x[3],1) if datetime.datetime.fromtimestamp(int(x[1])).hour >= 6 and
                datetime.datetime.fromtimestamp(int(x[1])).hour <= 13
                    else(
                      (x[0],x[2],x[3],2) if datetime.datetime.fromtimestamp(int(x[1])).hour >= 13 and
                        datetime.datetime.fromtimestamp(int(x[1])).hour <= 18
                      else (x[0],x[2],x[3],3)
                    )
                for x in listGroup]
    
#     context for last app used
#     newListGroup = []
#     for i in range(len(listGroup)):
#         if i == 0:
#             newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],-1))
#         else:
#             newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],listGroup[i-1][0]))
#    label with 1 and 0    
#     newListGroup = []
#     for i in range(len(listGroup)):
#         NumberOfLastApp = 50
#         if i < NumberOfLastApp:
#             lastApp = [x[0] for x in listGroup[:i]]
#             if listGroup[i][0] not in lastApp:
#                 newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],0))
#             else:
#                 newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],1))
#         else:
#             lastApp = [x[0] for x in listGroup[i-NumberOfLastApp:i]]
#             if listGroup[i][0] not in lastApp:
#                 newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],0))
#             else:
#                 newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],1))
#     listGroup = newListGroup   
    #recency context: flag events whose app is among the 6 most recently used apps in the training portion
    l = len(listGroup)
    numTrain = l * 8 / 10
    numTest = l - numTrain
    trainList = listGroup[:numTrain]      #0.8 train set
    trainList = trainList[::-1]           #listGroup is already in time order (the timestamp itself was dropped above), so reversing puts the most recent events first
    RecommenderDuplicate = [t[0] for t in trainList]        #take only the app ids
    Recommender = remove_duplicates(RecommenderDuplicate)   #remove duplicates, keeping the first occurrence
    NumberOfLastApp = 6
    if len(Recommender) >= NumberOfLastApp:
        lastApp = Recommender[:NumberOfLastApp]
    else:    
        lastApp = Recommender
    newListGroup = []
    for i in range(len(listGroup)):
        if listGroup[i][0] not in lastApp:
            newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],0))
        else:
            newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],1))
    listGroup = newListGroup  
    #listGroup = [(x[0],x[1],x[2],x[3],1) for x in listGroup]
#     context for frequency
#     newListGroup = []
#     for i in range(len(listGroup)):
#         if i == 0:
#             newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],-1))
#         else:
#             appGourp = [x[0] for x in listGroup[:i]]
#             frquentApp = Counter(appGourp).most_common()[0][0]
#             newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],frquentApp))
#    label with 1 and 0  
#     newListGroup = []
#     for i in range(len(listGroup)):
#         if i == 0:
#             newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],listGroup[i][4],0))
#         else:
#             appGourp = [x[0] for x in listGroup[:i]]
#             frquentApp = Counter(appGourp).most_common()
#             frquentApp = [x[0] for x in frquentApp]
#             numberOfFrequentApp = 4
#             if len(frquentApp) >= numberOfFrequentApp:
#                 frquentApp = frquentApp[:numberOfFrequentApp]                
#             if listGroup[i][0] not in frquentApp :
#                 newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],listGroup[i][4],1))
#             else:       
#                 newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],listGroup[i][4],0))
    #first-use context: flag events whose app is among the first 6 distinct apps used in the training portion
    l = len(listGroup)
    numTrain = l * 8 / 10
    numTest = l - numTrain
    trainList = listGroup[:numTrain]      #0.8 train set
    RecommenderDuplicate = [t[0] for t in trainList]        #take only the app ids
    Recommender = remove_duplicates(RecommenderDuplicate)   #remove duplicates, keeping the first occurrence
    numberOfFirstApp = 6
    if len(Recommender) >= numberOfFirstApp:
        firstApp = Recommender[:numberOfFirstApp]
    else:
        firstApp = Recommender
    newListGroup = []
    for i in range(len(listGroup)):
        if listGroup[i][0] not in firstApp:
            newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],listGroup[i][4],0))
        else:
            newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],listGroup[i][4],1))
    listGroup = newListGroup 
    
    #frequency context: flag events whose app is among the 6 most frequently used apps in the training portion
    l = len(listGroup)
    numTrain = l * 8 / 10
    numTest = l - numTrain
    trainList = listGroup[:numTrain]      #0.8 train set
    RecommenderDuplicate = [t[0] for t in trainList]        #take only the app ids
    Recommender = Counter(RecommenderDuplicate).most_common()   #apps sorted by usage count, descending
    Recommender = [t[0] for t in Recommender]
    
    numberOfFrequentApp = 6
    if len(Recommender) >= numberOfFrequentApp:
        frequentApp = Recommender[:numberOfFrequentApp]
    else:
        frequentApp = Recommender
    newListGroup = []
    for i in range(len(listGroup)):
        if listGroup[i][0] not in frequentApp:
            newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],listGroup[i][4],listGroup[i][5],0))
        else:
            newListGroup.append((listGroup[i][0],listGroup[i][1],listGroup[i][2],listGroup[i][3],listGroup[i][4],listGroup[i][5],1))
    listGroup = newListGroup 

    
    #time = datetime.datetime.fromtimestamp(int(line[1][0][1]))
    #line[1][1] = datetime.datetime.fromtimestamp(int("1284101485")).strftime('%Y-%m-%d %H:%M:%S')
    #return line[0],(workGroup+homeGroup)[:20],listGroup[:20]#,len(workGroup+homeGroup),len(workGroup),len(homeGroup)
    return line[0],newListGroup
final = groupData.map(convertLocation)
final.take(1)
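
A small illustration (with made-up points) of how the work location is chosen in convertLocation: for each daytime event the code counts how many daytime events lie within 100 m of it and keeps the index with the highest count.

In [ ]:
# two of the three points are ~15 m apart, the third is ~1 km away,
# so index 0 wins and becomes the assumed work location
toyDaytime = [
    ("app1", 1284101485, 40.0000, 29.0000, 1),
    ("app2", 1284105000, 40.0001, 29.0001, 1),
    ("app3", 1284108000, 40.0100, 29.0000, 1),
]
counts = []
for i, x in enumerate(toyDaytime):
    counts.append(0)
    for y in toyDaytime:
        if haversine(x[3], x[2], y[3], y[2]) < 0.1:   # within 100 m
            counts[i] += 1
print(counts)                        # [2, 2, 1]
print(counts.index(max(counts)))     # 0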

In [ ]:
from random import shuffle

def topFiveSortedList(oldList, context, probability):
    #keep oldList as an ascending list of the 5 best (app, probability) pairs:
    #replace the current minimum if the new probability beats it, then bubble it into place
    for x in range(0,len(oldList)-1):
        if x == 0 and oldList[x][1] < probability:
            oldList[x] = (context,probability)
            if oldList[x+1][1] < oldList[x][1]:
                temp = oldList[x+1] 
                oldList[x+1] = oldList[x]
                oldList[x] = temp 
            else:
                break
        elif oldList[x+1][1] < oldList[x][1]:
            temp = oldList[x+1] 
            oldList[x+1] = oldList[x]
            oldList[x] = temp 
        else:
            break
    #return sorted(oldList,key=lambda x: -x[1])
    return oldList      

def remove_duplicates(values):
    output = []
    seen = set()
    for value in values:
        # If value has not been encountered yet,
        # ... add it to both list and set.
        if value not in seen:
            output.append(value)
            seen.add(value)
    return output

def bayesian(line):
    #per-user evaluation: estimate P(app | context) on the first 80% of events and score top-5 recommendations on the last 20%
    listGroup = line[1]
    #shuffle(listGroup)                  #shuffle the list
    l = len(listGroup) 
    numTrain = l * 8 / 10
    numTest = l - numTrain
    trainList = listGroup[:numTrain]      #0.8 train set
    testList = listGroup[numTrain:]       #0.2 test set
    
    #trainRDD = sc.parallelize(trainList).count()
    newTestList = []
    for t in testList:
        context = [x for x in trainList if x[1]==t[1] and x[2]==t[2] and x[3]==t[3] and x[4]==t[4] and x[5]==t[5] and x[6]==t[6]]
        #context = [x for x in trainList if x[1]==t[1] and x[3]==t[3]]
        numContext = float(len(context))
        if numTrain != 0:
            p_context = numContext/numTrain  #P(C1i, C2j, C3k)
        else:
            p_context = 0
        p_app = [(-1,0),(-1,0),(-1,0),(-1,0),(-1,0)]
        context_no_duplicate = remove_duplicates(context)
        for c in context_no_duplicate:
            appi = [x for x in trainList if x[0]==c[0]]
            numAppi = float(len(appi))
            if numTrain != 0:
                p_appi = numAppi/numTrain
            else:
                p_appi = 0
            contextAppi = [x for x in trainList if x[0]==c[0] and x[1]==c[1] and x[2]==c[2] and x[3]==c[3] and x[4]==c[4] and x[5]==c[5] and x[6]==c[6]]
            #contextAppi = [x for x in trainList if x[0]==c[0] and x[1]==c[1] and x[3]==c[3]]
            if numAppi != 0:    #P(C1..C6 | APPid)
                p_contextAppi = len(contextAppi)/numAppi
            else:
                p_contextAppi = 0
            if p_context != 0:  #Bayes rule: P(APPid | C1..C6) = P(C1..C6 | APPid) * P(APPid) / P(C1..C6)
                p = p_contextAppi * p_appi / p_context
            else:
                p = 0
            p_app = topFiveSortedList(p_app,c[0],p)
        p_app = sorted(p_app,key=lambda x: -x[1])
        app_rec = map(lambda x:x[0],p_app[:5])
        newTestList.append((t[0],app_rec))
    scores = 0
    numHit = 0

    #position-weighted score: 1.0 if the true app is ranked 1st, 0.8 if 2nd, ..., 0.2 if 5th;
    #averaged over all test events (misses contribute 0)
    for t in newTestList:
        if t[0] == t[1][0]:
            scores = scores+1.0
            numHit = numHit+1
        elif t[0] == t[1][1]:
            scores = scores+0.8
            numHit = numHit+1
        elif t[0] == t[1][2]:
            scores = scores+0.6
            numHit = numHit+1
        elif t[0] == t[1][3]:
            scores = scores+0.4
            numHit = numHit+1
        elif t[0] == t[1][4]:
            scores = scores+0.2
            numHit = numHit+1
        else:
            numHit = numHit+1
    #scores = scores / numTest
    if numHit != 0:
        scores = scores / numHit
    else:
        scores = 0
    #return newTestList[:20]
    return scores
result = final.map(bayesian)
#result.mean()
f = open('asdf2.txt','a')
f.write(str(result.mean()) + ' last-6-used, first-6-used and top-6-frequency app contexts from the full training portion; position-weighted score averaged over all test events')
f.write('\n')
f.close()
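
A worked toy example of the estimate the loop above computes for each candidate app, P(APP | context) = P(context | APP) * P(APP) / P(context), with made-up counts:

In [ ]:
# hypothetical training set of 100 events:
#   app A occurs 20 times, 5 of those 20 in the query context,
#   and the query context occurs 8 times overall
numTrain = 100.0
p_app = 20 / numTrain              # P(APP)           = 0.20
p_context_app = 5 / 20.0           # P(context | APP) = 0.25
p_context = 8 / numTrain           # P(context)       = 0.08
print(p_context_app * p_app / p_context)   # 0.625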

In [ ]:
from random import shuffle

def topFiveSortedList(oldList, context, probability):
    for x in range(0,len(oldList)-1):
        if x == 0 and oldList[x][1] < probability:
            oldList[x] = (context,probability)
            if oldList[x+1][1] < oldList[x][1]:
                temp = oldList[x+1] 
                oldList[x+1] = oldList[x]
                oldList[x] = temp 
            else:
                break
        elif oldList[x+1][1] < oldList[x][1]:
            temp = oldList[x+1] 
            oldList[x+1] = oldList[x]
            oldList[x] = temp 
        else:
            break
    #return sorted(oldList,key=lambda x: -x[1])
    return oldList      

def remove_duplicates(values):
    output = []
    seen = set()
    for value in values:
        # If value has not been encountered yet,
        # ... add it to both list and set.
        if value not in seen:
            output.append(value)
            seen.add(value)
    return output

def bayesian(line):
    listGroup = line[1]
    #shuffle(listGroup)                  #shuffle the list
    l = len(listGroup) 
    numTrain = l * 8 / 10
    numTest = l - numTrain
    trainList = listGroup[:numTrain]      #0.8 train set
    testList = listGroup[numTrain:]       #0.2 test set
    
    #trainRDD = sc.parallelize(trainList).count()
    newTestList = []
    for t in testList:
        context = [x for x in trainList if x[1]==t[1] and x[2]==t[2] and x[3]==t[3] and x[4]==t[4] and x[5]==t[5] and x[6]==t[6]]
        #context = [x for x in trainList if x[1]==t[1] and x[3]==t[3]]
        numContext = float(len(context))
        if numTrain != 0:
            p_context = numContext/numTrain  #P(C1i, C2j, C3k)
        else:
            p_context = 0
        p_app = [(-1,0),(-1,0),(-1,0),(-1,0),(-1,0)]
        context_no_duplicate = remove_duplicates(context)
        for c in context_no_duplicate:
            appi = [x for x in trainList if x[0]==c[0]]
            numAppi = float(len(appi))
            if numTrain != 0:
                p_appi = numAppi/numTrain
            else:
                p_appi = 0
            contextAppi = [x for x in trainList if x[0]==c[0] and x[1]==c[1] and x[2]==c[2] and x[3]==c[3] and x[4]==c[4] and x[5]==c[5] and x[6]==c[6]]
            #contextAppi = [x for x in trainList if x[0]==c[0] and x[1]==c[1] and x[3]==c[3]]
            if numAppi != 0:    #P(C1..C6 | APPid)
                p_contextAppi = len(contextAppi)/numAppi
            else:
                p_contextAppi = 0
            if p_context != 0:  #Bayes rule: P(APPid | C1..C6) = P(C1..C6 | APPid) * P(APPid) / P(C1..C6)
                p = p_contextAppi * p_appi / p_context
            else:
                p = 0
            p_app = topFiveSortedList(p_app,c[0],p)
        p_app = sorted(p_app,key=lambda x: -x[1])
        app_rec = map(lambda x:x[0],p_app[:5])
        newTestList.append((t[0],app_rec))
    scores = 0
    numHit = 0

    #position-weighted score: 1.0 if the true app is ranked 1st, 0.8 if 2nd, ..., 0.2 if 5th;
    #averaged over hits only (misses are left out of the denominator)
    for t in newTestList:
        if t[0] == t[1][0]:
            scores = scores+1.0
            numHit = numHit+1
        elif t[0] == t[1][1]:
            scores = scores+0.8
            numHit = numHit+1
        elif t[0] == t[1][2]:
            scores = scores+0.6
            numHit = numHit+1
        elif t[0] == t[1][3]:
            scores = scores+0.4
            numHit = numHit+1
        elif t[0] == t[1][4]:
            scores = scores+0.2
            numHit = numHit+1
#         else:
#             numHit = numHit+1
    #scores = scores / numTest
    if numHit != 0:
        scores = scores / numHit
    else:
        scores = 0
    #return newTestList[:20]
    return scores
result = final.map(bayesian)
#result.mean()
f = open('asdf2.txt','a')
f.write(str(result.mean()) + ' last-6-used, first-6-used and top-6-frequency app contexts from the full training portion; position-weighted score averaged over hits only')
f.write('\n')
f.close()
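
The only difference from the previous cell is the denominator: the position-weighted score is averaged over hits only. A tiny illustration with made-up recommendation lists:

In [ ]:
# three test events: true app ranked 1st, true app ranked 3rd, true app missing
toyTest = [(4, [4, 9, 2, 7, 1]), (2, [9, 4, 2, 7, 1]), (8, [9, 4, 2, 7, 1])]
weights = [1.0, 0.8, 0.6, 0.4, 0.2]
hitScores = [weights[rec.index(app)] for app, rec in toyTest if app in rec]
print(sum(hitScores) / len(toyTest))     # 1.6 / 3 ~ 0.53 (previous cell's metric)
print(sum(hitScores) / len(hitScores))   # 1.6 / 2 = 0.80 (this cell's metric)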

In [ ]:
from random import shuffle

def topFiveSortedList(oldList, context, probability):
    for x in range(0,len(oldList)-1):
        if x == 0 and oldList[x][1] < probability:
            oldList[x] = (context,probability)
            if oldList[x+1][1] < oldList[x][1]:
                temp = oldList[x+1] 
                oldList[x+1] = oldList[x]
                oldList[x] = temp 
            else:
                break
        elif oldList[x+1][1] < oldList[x][1]:
            temp = oldList[x+1] 
            oldList[x+1] = oldList[x]
            oldList[x] = temp 
        else:
            break
    #return sorted(oldList,key=lambda x: -x[1])
    return oldList      

def remove_duplicates(values):
    output = []
    seen = set()
    for value in values:
        # If value has not been encountered yet,
        # ... add it to both list and set.
        if value not in seen:
            output.append(value)
            seen.add(value)
    return output

def bayesian(line):
    listGroup = line[1]
    #shuffle(listGroup)                  #shuffle the list
    l = len(listGroup) 
    numTrain = l * 8 / 10
    numTest = l - numTrain
    trainList = listGroup[:numTrain]      #0.8 train set
    testList = listGroup[numTrain:]       #0.2 test set
    
    #trainRDD = sc.parallelize(trainList).count()
    newTestList = []
    for t in testList:
        context = [x for x in trainList if x[1]==t[1] and x[2]==t[2] and x[3]==t[3] and x[4]==t[4] and x[5]==t[5] and x[6]==t[6]]
        #context = [x for x in trainList if x[1]==t[1] and x[3]==t[3]]
        numContext = float(len(context))
        if numTrain != 0:
            p_context = numContext/numTrain  #P(C1i, C2j, C3k)
        else:
            p_context = 0
        p_app = [(-1,0),(-1,0),(-1,0),(-1,0),(-1,0)]
        context_no_duplicate = remove_duplicates(context)
        for c in context_no_duplicate:
            appi = [x for x in trainList if x[0]==c[0]]
            numAppi = float(len(appi))
            if numTrain != 0:
                p_appi = numAppi/numTrain
            else:
                p_appi = 0
            contextAppi = [x for x in trainList if x[0]==c[0] and x[1]==c[1] and x[2]==c[2] and x[3]==c[3] and x[4]==c[4] and x[5]==c[5] and x[6]==c[6]]
            #contextAppi = [x for x in trainList if x[0]==c[0] and x[1]==c[1] and x[3]==c[3]]
            if numAppi != 0:    #P(C1..C6 | APPid)
                p_contextAppi = len(contextAppi)/numAppi
            else:
                p_contextAppi = 0
            if p_context != 0:  #Bayes rule: P(APPid | C1..C6) = P(C1..C6 | APPid) * P(APPid) / P(C1..C6)
                p = p_contextAppi * p_appi / p_context
            else:
                p = 0
            p_app = topFiveSortedList(p_app,c[0],p)
        p_app = sorted(p_app,key=lambda x: -x[1])
        app_rec = map(lambda x:x[0],p_app[:5])
        newTestList.append((t[0],app_rec))
    scores = 0
    numHit = 0

    #hit rate: the fraction of test events whose true app appears anywhere in the top-5 list
    for t in newTestList:
        if t[0] in t[1]:
            numHit = numHit + 1
    if numTest != 0:
        scores = float(numHit) / numTest
    else:
        scores = 0
    #return newTestList[:20]
    return scores
result = final.map(bayesian)
#result.mean()
f = open('asdf2.txt','a')
f.write(str(result.mean()) + ' last-6-used, first-6-used and top-6-frequency app contexts from the full training portion; hit rate over all test events')
f.write('\n')
f.close()
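
And the corresponding hit rate for the same toy example: the fraction of test events whose true app appears anywhere in the top-5 list, regardless of rank:

In [ ]:
toyTest = [(4, [4, 9, 2, 7, 1]), (2, [9, 4, 2, 7, 1]), (8, [9, 4, 2, 7, 1])]
hits = sum(1 for app, rec in toyTest if app in rec)
print(float(hits) / len(toyTest))        # 2 / 3 ~ 0.67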

In [11]:
from collections import Counter
a = ((1,1),(1,2),(2,3))
b = [x[0] for x in a]
count = Counter(b).most_common()
count = [x[0] for x in count]
1 not in count

listGroup = [1,1,1,1,1,1]
for i in range(len(listGroup)):
    print listGroup[:i]


[]
[1]
[1, 1]
[1, 1, 1]
[1, 1, 1, 1]
[1, 1, 1, 1, 1]

In [ ]: