Continued


In [1]:
import pandas as pd
import json
json_data = open('../views/sample/input00.in') # Edit this to where you have put the input00.in file

data = []
for line in json_data:
    data.append(json.loads(line))

# drop the two bare integers (9000 and 1000) that json.loads produced from the non-record lines
data.remove(9000)
data.remove(1000)

df = pd.DataFrame(data)
cleaned_df = pd.DataFrame(data[0:9000])  # keep the first 9000 records
data_df = cleaned_df.copy()

In [2]:
# extra libraries
from plotnine import *


/usr/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

More Text Analysis


In [3]:
# The 75th and 90th percentiles of the `__ans__` column.
data_df['__ans__'].quantile([0.75, 0.9])


Out[3]:
0.75     4.019578
0.90    12.787143
Name: __ans__, dtype: float64

Let's analyze the frequency of words that show up in question texts whose __ans__ value is at or above the 75th percentile.


In [4]:
# svf stands for seventy-fifth; 4.019578 is the 75th-percentile `__ans__` value computed above
svf_words = data_df[data_df.__ans__ >= 4.019578].question_text.values
svf_words = ' '.join(svf_words).split()
svf_unique_words = sorted(set(svf_words))

In [5]:
svf_list = []
for word in svf_unique_words:
    if len(word) >= 3:  # ignore words shorter than 3 characters
        svf_list.append([svf_words.count(word), word])  # [frequency, word]

svf_list_df = pd.DataFrame(svf_list, columns=['freq', 'word']).sort_values(by=['freq'], ascending=False)
svf_list_df.describe()


Out[5]:
freq
count 7615.000000
mean 3.025476
std 21.722844
min 1.000000
25% 1.000000
50% 1.000000
75% 2.000000
max 1275.000000

Since computing a correlation for each of the 7,615 words is an intensive task, and most of them appear only once anyway, let's instead work with just the 1,000 most frequent words.


In [6]:
# first sort the whole thing
svf_list_df = pd.DataFrame(svf_list, columns=['freq', 'word']).sort_values(by=['freq'], ascending=False)

# pick out the most frequent 1000 words
svf_words_freq_sorted = svf_list_df['word'][:1000].sample(frac=1) # the `sample` method randomizes all the rows (frac=1) after picking the top 1000

In [169]:
# convert it to an array
svf_list_ = svf_words_freq_sorted.values
svf_list_[0:30] # "head" of the array


Out[169]:
array(['degree', 'experience?', 'way', 'buy', 'When', 'BITS', 'potential',
       'scene', 'Larry', 'sleep', 'With', 'learning?', 'regular',
       'importance', 'outside', 'Web', "don't", 'legal', 'like?', 'called',
       'university', 'progress', 'list', 'development', 'store', 'help',
       'bringing', "you've", 'sort', 'apply'], dtype=object)

To make the task even less intensive (running the analysis described below took longer than 15 minutes on the full 1,000 words), let's do it in chunks.


In [8]:
# divide a sequence into `num` roughly equal chunks
def chunkIt(seq, num):
    avg = len(seq) / float(num)
    chunks = []
    last = 0.0

    while last < len(seq):
        chunks.append(seq[int(last):int(last + avg)])
        last += avg
    return chunks

In [9]:
# `out` is a list of 20 subarrays, each containing 50 words
out = chunkIt(svf_list_, 20)

For each word, we will compute the correlation between its appearance in a question text and __ans__.


In [10]:
# give it an index and it will return the correlation array of the words in `out[index]`
def corrIt(idx):
    var = []
    for i in range(len(out[idx])):
        # 0/1 indicator: does this question's text contain the i-th word of the chunk?
        a = data_df.question_text.apply(lambda x: 1 if any(pd.Series(x).str.contains(str(out[idx][i]))) else 0)
        var.append(a.corr(data_df['__ans__']))
    return var

In [11]:
# return a dataframe of words and the correlation of their appearance, sorted by correlation
def dfIt(idx):
    corr_list = corrIt(idx)
    mash_df = pd.concat([pd.DataFrame(corr_list, columns=['cor']),
                         pd.DataFrame(out[idx], columns=['word'])],
                        axis=1)
    return mash_df.sort_values(by='cor', ascending=False)

In [12]:
df_19=dfIt(19)
df_19.head()


Out[12]:
cor word
43 0.049485 facts
7 0.042782 girl
1 0.035482 some
11 0.028302 sexual
2 0.027856 team

It would be nice to add each word's length and frequency as well.


In [13]:
def enrich(data_f):
    data_f['len'] = data_f['word'].apply(lambda x: len(x)) #adds length
    data_f['freq'] = data_f['word'].apply(lambda x: # grab frequency of the word from `svf_list_df`
                                          svf_list_df[svf_list_df['word']==x].iloc[0].freq)
    return data_f

In [14]:
enrich(df_19).head()


Out[14]:
cor word len freq
43 0.049485 facts 5 14
7 0.042782 girl 4 10
1 0.035482 some 4 212
11 0.028302 sexual 6 5
2 0.027856 team 4 6

In [15]:
(ggplot(df_19, aes('freq', 'cor', color='factor(len)'))+ geom_point())


Out[15]:
<ggplot: (-9223363266706661565)>

In [16]:
df_19['len'].value_counts(bins=4)


Out[16]:
(2.991, 5.0]    26
(5.0, 7.0]      14
(7.0, 9.0]       8
(9.0, 11.0]      2
Name: len, dtype: int64

In [17]:
# divide length into bins
bins = [2.9, 4.9, 7.9, 12] # bins will effectively be [3, 5), [5, 8), [8, 12]
group_names = ['[3, 5)', '[5-8)', '[8-12]']
df_19['len_bins'] = pd.cut(df_19.len, bins, labels=group_names)

In [18]:
df_19.head()


Out[18]:
cor word len freq len_bins
43 0.049485 facts 5 14 [5-8)
7 0.042782 girl 4 10 [3, 5)
1 0.035482 some 4 212 [3, 5)
11 0.028302 sexual 6 5 [5-8)
2 0.027856 team 4 6 [3, 5)

Now let's redo the plot.


In [19]:
(ggplot(df_19, aes('freq', 'cor'))
     + geom_point(aes(color='len_bins'))
     + geom_line(aes(group='len_bins'), size=0.1)
)


Out[19]:
<ggplot: (-9223363266766295229)>

It would help to zoom into the 0-50 frequency range, since it is apparent that higher-frequency words have higher correlation.


In [20]:
(ggplot(df_19[df_19['freq']<50], aes('freq', 'cor'))
     + geom_point(aes(color='len_bins'))
     + geom_line(aes(group='len_bins'), size=0.1)
)


Out[20]:
<ggplot: (-9223363266766316698)>

So the previous conclusion holds for words that are three or four letters long.


In [21]:
(ggplot(df_19[df_19['freq']<15], aes('freq', 'cor'))
     + geom_point(aes(color='len_bins'))
     + geom_line(aes(group='len_bins'), size=0.1)
)


Out[21]:
<ggplot: (8770088592732)>

Nothing more is discernible at this point.


To check our work, let's go through the code. Say we pick out[19][2], the word at index 2 in the 19th chunk.


In [22]:
print(out[19][2], str(out[19][2]))
data_df.question_text.apply(lambda x: 1 if any(pd.Series(x).str.contains(str(out[19][2]))) else 0).describe()


team team
Out[22]:
count    9000.000000
mean        0.005000
std         0.070538
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: question_text, dtype: float64

The series has 9000 entries of zeros and ones; a zero means the question text in that row doesn't contain the keyword out[19][2]='team'. So, looking at the mean of the series, we can tell that data points whose question text contains this word make up about 0.5% of the dataset.

We then calculate the correlation between __ans__ and this series of 0's and 1's. This effectively tells us whether we should weight question texts containing that particular word more heavily.
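
As a quick cross-check (a minimal sketch, not part of the original run, assuming `data_df` and `out` from the cells above are in scope), the same share and correlation can be reproduced with vectorized string matching:


In [ ]:
word = str(out[19][2])  # 'team'
ind = data_df.question_text.str.contains(word, regex=False).astype(int)
print(ind.mean())                    # share of questions containing the word (~0.005)
print(ind.corr(data_df['__ans__']))  # the point-biserial correlation corrIt() computes for this word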

Now we will check the correlation in the remaining 19 chunks just as we did with the 19th chunk, simply by running corrIt(index).

We also need to replace a few words that contain regex metacharacters: `str.contains` treats its pattern as a regular expression, so `C++` and `(or` raise regex errors (the "multiple repeat" error, for example), and `\\sin` would match the wrong thing since `\s` means whitespace.
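
To see why (a standalone sketch, not from the original run): an unescaped pattern like `C++` is an invalid regex, while passing regex=False would match it literally.


In [ ]:
import re
import pandas as pd

s = pd.Series(['Why is C++ still popular?'])
try:
    s.str.contains('C++')                   # '+' is a regex quantifier, so the pattern is invalid
except re.error as err:
    print('regex error:', err)              # e.g. "multiple repeat"
print(s.str.contains('C++', regex=False))   # literal matching works fine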


In [23]:
# Searching the word lists: `\\` appears only once (as `\\sin`) and `++` appears only in `C++`
for i in range(20):
    if 'C++' in list(out[i]):
        j = list(out[i]).index('C++')
        print('C++ was at ', i, j)
        out[i][j] = 'Cpp'


C++ was at  1 37

In [24]:
for i in range(20):
    if '\\sin' in list(out[i]):
        j = list(out[i]).index('\\sin')
        print('\\sin was at', i, j)
        out[i][j] = 'sin'


\sin was at 2 9

In [25]:
for i in range(20):
    if '(or' in list(out[i]):
        j = list(out[i]).index('(or')
        print('(or was at ', i, j)
        out[i][j] = 'or'


(or was at  14 25

Calculate the correlations for all 20 chunks.


In [27]:
corr_big_list = []
for i in range(0,20):
    corr_big_list.append(corrIt(i))

Since the `out` list is randomized (see the `svf_list_` section) and computing `corr_big_list` takes quite a while, I have saved a copy of this particular `out` / `corr_big_list` pair as `out_1` and `corr_big_list_1` for convenience:


In [150]:
corr_big_list_1=[[0.00030816712579101571, 0.0044730016004795375, -0.018184254245652551, -0.011636498757889403, -0.0098418778581828206, 0.026097559924847694, 0.013042976799379082, -0.0024952856521519564, -0.0014278019418653462, -0.0066361332379731679, 0.034104606674625594, -0.0035300160142935299, -0.003388724944028706, -0.00080654880511365698, 0.0064072351728142116, -0.0066817381346244509, 0.00040979589589246629, -0.010358276711574618, 0.024729665673063398, -0.0048034598229090199, 0.1023288271726879, -0.0033370673570963859, 0.035198130429207504, -0.0023873121233479811, -0.0075365026756989712, -0.0043573283771661201, -0.00091066872737137863, 0.0085851641623233832, -0.0069155008482646016, -0.0033476240638101243, -0.00033489503766559448, 0.014024682957343831, -0.0066328075447330643, 0.0077050217122674606, 0.0083476497238622431, 0.00027866044671089101, -0.0046298488103451371, 0.0094317686368927591, 0.0031286448646075495, -0.007465595807550328, -0.0062456567924396708, -0.0051985229784726949, -0.0087520730389423484, 0.0014825103568157643, 0.025447137503987683, 0.0031569477232396693, -0.007618273825201566, 0.022717987737610412, -0.0065577550547866823, 0.0024386167711497784], [-0.0028881589497217167, 0.010970003941513328, 0.0012167271461354528, -0.0021697742804818623, 0.024265919636645681, -0.00024240895328244371, -0.0034342361979813633, -0.0032547763290757593, 0.0020840162391008521, 0.016400747472871784, 0.0037064189243741956, -0.0038322475167536837, -0.0058279101130724415, -0.003531779753513931, -0.0033060631530503915, -0.0047608726154802911, -0.0015200622602567389, -0.010092491967139818, -0.0026847949372279622, -0.014343373247904022, -0.0024851015775787261, 0.0072252372611917833, 0.012332709730471519, 0.02166493696952319, 0.0020093176341874639, 0.0022830828870203054, 0.0027891198322104837, -0.0025104065772374035, -0.00055134449132394057, -0.001007314989035814, -0.0037201198144300551, 0.006633708403083834, -0.005827203317157627, 0.0027196570088708887, 0.013579865687406288, -0.0022465082543230513, 0.0017858139450578983, 0, 0.0013523228325291912, 0.0039347553975315414, 0.042502038772258076, -0.00056345496362009508, -0.0089123906202641118, -0.0029185246867027741, 0.0003741297966530131, -0.006718165630880242, -0.0038299141483705431, 0.00076828958569195448, -0.0073595881806147595, 0.0065152552923246213], [-0.0055662554304464693, 0.0021857985438673176, -0.006157833996541494, 0.020118287899894512, -0.0064322125267556015, -0.0033640023093176388, -0.0034487164022805384, 0.0010694360014762655, 0.010806860685386742, 0.019611096216255855, 0.06473026552187619, -0.007815658265600196, -0.0012049951012723396, 0.0096650532832685109, -0.0081848479110903649, 0.0024824638193696215, 0.045228364686901064, 0.020919625833423588, -0.0057262034628689897, -0.0019546889972717834, -0.0017681380449655051, 0.0019305929838144295, -0.0031441860996190675, -0.0091759626612213149, 0.0018272732681465899, -0.003419877793160774, 0.024590202172987598, -0.0071735609830342996, -0.0019238105997293742, -0.01032902061134779, 0.010247275627760207, -0.00056781109993696277, -0.0074412546495274809, 0.0022089926031274303, -0.0054321061719030551, -0.010937530508012645, 0.0050621539189750044, 0.011163850401181617, -0.00097233345353591328, -0.0067471022553516153, 0.0018617529033470284, -0.011395546776726858, 0.0036492808301568512, 0.026283788933778612, -0.0025309817485629493, -0.0082404999294812854, 0.011175686406548348, -0.00084449398642895326, -0.0038134751677193356, 0.0031286448646075495], [0.0039335300005773734, -0.0027872361334454962, 
0.025109668718567041, -0.0024729319841923386, -0.0054221514177400405, -0.0046140021830209455, -0.0029644205787335356, 0.0058658412666491933, 0.0020648373671194958, -0.0030732652382889514, 0.0043183460398299178, -0.0057778457326998888, -0.0052229711454849287, -0.0087179001899529308, -0.0070100485247778727, 0.010058161838999092, -0.013467420834281653, -0.0076595558346031116, -0.006846817797872766, 0.0039897170246467667, -0.00225475456179865, 0.0013765911353118988, 0.057405017749717721, -0.01071715522122787, -0.0063368385283841403, 0.0063244299368342832, -0.0051365796481612006, 0.00025895772891952227, -0.0036898711970211047, -0.004416346605422782, -0.0073325950512096111, -0.0083767904154685175, -0.002655266274509108, 0.0040331504683418593, -0.0059432116691789738, -0.011600644473004301, -0.0027082071049287838, -0.0089337864054543428, -0.0033391570284709042, 0.0056973588788680057, -0.0041113398203742732, -0.0083833449232279429, -0.0019220847118080551, -0.0046328240228199523, 0.052146092864333041, 0.0094031766500344595, -0.0035331066727011832, 0.020810083453388445, -0.0076294707410923378, 0.01827868115925433], [-0.0040999983406437204, 0.0071064687418175591, -0.0041744351036019541, -0.0092409000028602969, 0.0083413418554632458, 0.0061532729187045278, 0.024004010174493714, 0.0082475317939110435, -0.0029356768647843911, -0.0052995819515106448, -0.0029477837556126695, -0.0059717367506664154, -0.0012952588264538371, 0.00021457496458008647, 0.028008637897122796, -0.003350012091366671, 0.0067106966916407112, -0.0017859386698853088, 0.0019353708646550583, 0.015645789981094838, 0.0067197670852938776, 0.0026522558154171249, -0.0003834322656298732, 0.0026174491573469076, 0.0018674784322043995, -0.012823927615131186, -0.0025581337564028828, 0.036261302754747878, 0.02945685709070631, 0.0017672842404350584, -0.026653736529446911, 0.021783953923978368, -0.0063488181493658296, 0.0093811436284397982, 0.0012313161467450836, -0.0027337978275053599, -0.00012914090885128123, -0.0061367746890713549, -0.0037267591462406286, 0.0026482795080635798, -0.0023140109871226957, -0.0039020252953801326, 0.00072875607763547845, 0.0015743850131852326, 0.0035940581761771916, 0.024590202172987598, 0.024655095186671205, 0.014183880200424995, -0.0020420290703361045, -0.014693842607312928], [0.0038503522695805346, -0.0023517091676619525, -0.010318305672876588, -0.00042244225200082811, -0.0039518917421703355, -0.014099966557010814, 0.0024579510388644419, 0.0030376607863483707, 0.00045128384453019477, 0.021650811452954503, 0.0013021710725960036, 0.0083341419580174169, 0.0022297621686237961, -0.0020111774561364479, -0.0071181590003503133, 0.0017916488125445846, 0.01113642310938299, 0.010138389559395696, 0.025112717036490935, -0.0090855374891545132, -0.0082730701654181223, 0.00675544616502422, -0.0058234912322820669, -0.011867624622636167, -0.00083783076087712207, -0.0031845608898710001, -0.0031842325860524097, -0.0069703493237041414, 0.0006256344722183468, 0.0033393274744845792, 0.00046985420465254477, -0.0027841821451274596, 0.0020286587868865386, -0.0021104300455953248, -0.0034687677950377364, -0.0045299810568424003, 0.0047083750935003957, -0.0063750052148376698, 0.0046934237754060384, -0.004856157897193833, 0.0054115145682798035, 0.0051035416338226867, -0.0042193702592235195, -0.0032130289521849786, 0.011696984645701548, -5.8902068559955218e-05, 0.012184247199221634, -0.0033517694188394372, 0.024330266219235487, 0.013076039980225058], [-0.0038451914011656519, -0.00021363649429256967, 5.9090333593991127e-05, -0.0073023692941335643, 
0.015061958681791316, 0.009585116790732635, 0.016930010402586914, 0.022387589787546621, -0.0026631965047689875, -0.0051318945834591427, 0.032753509899166221, 0.00041031453723438393, -0.0065245314768517769, -0.0071417760259205295, 0.0027903080571627941, 0.00040036555401640753, 0.028385483975969798, -0.0015095560242410814, -0.007638712730518386, -0.0040741820181012392, -0.010000785517681023, -0.0087611653520964939, 0.011136944368517943, 0.0051680346193349706, 0.0055985008002052026, -0.00080947815204772118, -0.0081609889727837955, 0.0081961287857103184, -0.011227820874658572, -0.010238005881836728, -0.003419877793160774, -0.0021931925323416912, -0.00656274344782567, -0.0088650888517708606, 0.0094240204530220174, 0.0021069579342005264, -0.0027983374335761378, 0.0030171507778535872, -0.01097187825221722, 0.00074066758851370259, 0.0050445268070984867, -0.007870968171528674, -0.010270834859436867, -0.0039036695585955416, -0.005993248340466906, -0.0029204620248914669, -0.009223794131203198, -0.0076433156190382915, -0.0012567290676569255, -0.0052400047482109554], [0.0038915939735842831, 0.010419525081562821, -0.0014618143848855159, 0.010263483727329549, -0.010147434195145164, -0.0016170715462519084, -0.0025132968406928499, -0.0031254701326580581, -0.0032887401182045213, -0.0034556989966380669, -0.012067012166186676, 0.0058702909130399997, 0.012847738210386711, 0.00013033764649521306, -0.0018411710863566961, 0.0039131486150130656, 0.002314308546943724, -0.0040535444224627697, 0.0050241272805715371, -0.001702124258244042, 0.0075564504686124068, 0.001550024429751934, -0.002887086432830136, 0.0072807186084017771, 0.0029205697320207231, 0.00081900061778488495, 0.0023003886197646818, 0.0032932866844910705, 0.009399055035279498, -0.0017197794980169281, -0.0012322012735645216, -0.0052786005677411627, -0.0088920062953825232, 0.03970866645777648, -0.0057586510024742726, 0.0025391228728078299, -0.00021039548638815764, -0.014662151053035876, -0.010048852910795273, -0.0040067115025220366, -0.0053217909670061174, 0.0029828715085204946, -0.0021048820455902063, 0.0035449319017630287, 0.00069485710144022265, 0.03596378835673901, -0.00057400669361574205, 0.018226979955979515, -0.0032560820862573042, 0.002008256246220017], [-0.015038653093675068, 0.01052483364975592, -0.0079863665852985873, -0.0023034061867802176, 0.0017043042379101528, -0.0061036426811022894, 0.003225738154128278, -0.0037952025903429682, 0.0021528158939119794, -0.002769857822606899, 0.0019854337272486576, 0.030264366126329077, 0.010970015145314302, 0.0082252611128555524, -0.014575735929070343, -0.0026135399341919627, 0.0067656588201241716, 0.0035870745225404232, -0.0020654670087214807, -0.00025227353405862633, -0.00066890175365683709, 0.0067113280622261521, -0.0063130420494131728, 0.0063184052216883734, 0.0037607464480548677, -0.0045919200550520282, -0.0053460344591380405, -0.0032186246376210274, -0.0025581337564028828, -0.0070792437043274712, 0.045281035540934676, -0.0019972461158679657, -0.0074370659997657967, 0.029401468531120169, -0.00095022065973632665, -0.0057784924425085806, -0.005935098965299446, 0.00027670428440446494, 0.0045496340505188334, -0.0038764324918879062, -0.0030744393824485073, -0.0045624099980782181, -0.0046489715113922402, -0.0059985173639422349, 0.011496458322218236, 0.008751826213773484, -0.010452830003218071, -0.0040717383453025766, -0.0090070215124744505, -0.0040278223122745998], [0.0083550610277740023, -0.029050900883528373, -0.0059106816833672231, 0.0025835362973095173, -0.00047749693317631898, 0.018436905510209354, 
-3.8100417005802405e-05, -0.0055563259956187333, 0.0086672026387371922, 0.00063187176136450995, 0.00037380900035735196, -0.0012005370222728615, -0.0049455635833718049, -0.0018319569495935028, -0.005709456317318686, -0.009105033441588176, -0.0023731954146389753, -0.0015316030560129272, -0.0060180999470846449, -0.0038337183122445278, -0.0050635482441812284, 0.011996320748298336, 0.0043345341132875279, -0.0060829850440349388, 0.00078186190151715142, -0.0065426159576585695, 0.0011657955259697095, 0.022621877650546317, 0.0054682153838136009, -0.0098564257470907576, -0.0040672990175107671, -0.0047926150452541382, 0.012519106443645582, 0.0037378607394021532, 0.0012509562605219327, -0.0071523724079871447, 0.0051035416338226867, -0.003997929253643724, -0.017743068830958764, -0.0020268299884030048, 0.00037163709034099696, -0.0081609889727837955, 0.0026093962558993906, -0.0059053378170682881, 0.0095691906133355611, -0.0012167806386807931, -0.008495618197349666, -0.0050955885526389631, 0.0071940750924432886, 0.012626858271881676], [-0.0062033595185335116, -0.0044518895663623517, 0.010122869733204186, 0.0052590591324923261, -0.0017133949744627213, -0.0052020701301213976, -0.0027011730028023064, -0.0035300160142935299, 0.012526541348959797, 0.011307020773258479, 0.011127857534022165, -0.0021683129538185423, -0.0031086399060015427, 0.008751826213773484, 0.010192807934568253, -0.0058626654583513278, -0.0057782506601284471, -0.0026096042760198833, -0.0035230359831737402, -0.0070266762569692635, 0.017119928387593034, -0.0018287755740245776, -0.0052865659125757745, 0.0061532729187045278, -0.0060899997886011354, -0.0039920757611741553, 0.0010018136069110566, -0.0032277025135943717, 0.0023945481447076402, -0.0003937682063627097, -0.0059644708495630271, 0.0067197670852938776, -0.0062245675277347815, -0.00041636302583883882, -0.005404639563105028, -0.0054455124177861568, 0.0098012868366185585, -0.010238308303535339, -0.00079942227911325371, -0.0056243488165086866, 0.048650643880207463, 0.007901494859536189, 0.0096640776546497859, 0.021838105204411987, 0.018903671245551652, -0.0035459246645856732, 0.0010694360014762655, 0.0025867626300677495, 0.0035236311152237938, 0.026537827187008613], [0.0079532169686003466, 0.004982873762155819, 0.0082475317939110435, -0.0059805694251638588, -0.0071850066596413648, -0.001220919583676416, 0.012690952660123449, -0.0074786033472630476, -0.0025472465008443534, 0.00034019764663567419, -0.0078830509781961048, -0.0070709827639764038, -0.0034518435675685525, 0.00062669661274122922, -0.0067005333108912371, -0.00020198578267311394, -0.0056746449828797138, -0.014751886177321712, -0.00019718970001080682, -0.0011064547317243428, -0.0036295370177789514, -0.0061616445228561646, 0.00078559356232328734, -0.0059289095438923274, 0.0032335371246084831, 0.0023673408825878651, -0.0026761516318406994, -0.0089139720371155268, -0.0015100285871206538, -0.0041163601446803869, -0.011970782074637518, -0.0058268009433599874, -0.0011529303586283719, -0.0028351384834251069, 0.0085715687236584456, -0.0028711414578712438, -0.0035675113215551577, 0.0064486515228663532, 0.00066490359591693635, 0.0019569901055810487, -0.0029798744279565968, 0.0052335700839740654, -0.0050921547961718053, -0.0016731643899604532, 0.0068279774697896575, -0.01097187825221722, -0.0049064981602198431, 0.0052552783055297781, -0.008542719756438227, 0.010249949199206944], [0.0067511968300894903, 0.01074805457971997, 0.0048257191735579345, 0.0512523800623211, 0.0070232356282575467, 0.0021657086982868148, -0.0036049896889845507, 
0.01557247414537136, 0.012710019789090081, -0.0016467322913514856, -0.0025895005870781732, -0.001207033861826496, -0.0084358836143301249, 0.004073308020897521, -0.0054232749337083429, -0.011140710146534567, -0.0073726276234351339, -0.0024615635962582918, -0.0079223903990460892, 0.0025891567456240005, -0.0025938251481470997, -0.0094730761346421832, 0.0098950230562226838, -0.011570294939930082, -0.0037208554841798706, -0.0037664071327770544, -0.0054443717842090908, -0.010170767261880379, 0.021720509764465865, 0.027566755054238782, -0.0014204949086361678, -0.00067570561456134709, -0.0062955698076899895, -0.0070369881036013073, 0.0037570868752071647, -0.0038073323829991715, 0.00063709037595732982, -0.0099654500974158903, 0.0019289567514207307, -0.0030725575177697078, 0.013224318848091166, 0.0090585549607551438, 0.012125487227658941, -0.0041035459076260213, -0.0037184681746698835, -0.0058574906817098656, -0.0011012297309672618, 0.0065438455956716331, -0.0058987830850976654, 0.0098185565063758744], [-0.012654242717889164, -0.0025672486694806335, -0.0059432116691789738, -0.0024592655951068501, -0.010110302732871794, 0.0071920269311909583, -0.00027560280488812431, -0.0035401219539874791, 0.0041007750552731013, -0.0042252191122454409, -0.0023071670851554623, 0.0023041824897939052, -0.0056328047875145388, 0.012184247199221634, 0.0015414206672716782, -0.0056902740887640009, 0.0083012909377272633, -0.01122654036800845, -0.0073595881806147595, 0.0034273901043434438, -0.0030985902617879043, 0.011264638317512199, 0.002264724791756427, 0.083267258086572302, 0.002363614050036046, -0.015750555377159924, -0.0015402932161761242, -0.0043299906415134731, 0.0073033837659305005, 0.0034956063277067628, 0.0017565046652500006, -0.0030831483356614348, 0.028146282369618601, 0.022387589787546621, -0.0025617014493173134, 0.023671991914385355, -0.00024240895328244371, 0.0038770481984026638, -0.010939882288173936, 0.020665870016532816, 0.0089047764907555938, -0.0085764897976901375, -0.0044136370500696956, 0.001812700921538562, -0.00011281897515511597, -0.0045931805225767376, 0.010419525081562821, -0.0048894707574018799, 0.02014725179242742, 0.0047104180633297018], [-0.0096014833278657095, -0.0084035894378828285, -0.007687104334526647, -0.0032698757357000007, -0.0002490757315802865, 0.0094831683873128451, -0.0060135313225925966, 0.0052814559136878077, -0.0082609161488176701, -0.0065815953517476293, 0.028911686398659542, -0.0062387841699766636, 0.018649412096706459, -0.0086387514022342529, 0.023835397088238858, -0.007200872443893544, -0.0039865446015948285, -0.0014114544661985649, 0.0089546843910136235, 0.0010229880009275955, 0.0048040834877872071, 0.012845813029980504, 0.0058840457340290754, 0.0026560074152205569, 0.063296508519526104, -0.014224800403898305, 0.03596378835673901, 0.018608853514283284, -0.01426558554787689, 0.0029227349201175552, -0.0051281340547188721, -0.0079807481750164885, -0.010372853199477264, -0.012906187365057579, 0.0019139898264629444, 0.0017090667576043609, 0.015070821571564853, 0.0042709081049846819, -0.0025869853028361884, 0.025918395780214397, 0.028273746629177299, -0.010419581778707108, -0.0075044081473940368, -0.0030751222228698177, -0.0069669090572028929, -0.010276761750972554, -0.0012421896632617297, 0.0072527721572508384, -0.0056635312636723429, 0.0058702909130399997], [-0.0087802015564753776, 0.0061344240131735407, -0.0040490001808132286, 0.0011034330032901444, -0.0069003137220430477, 0.00088576454887124548, -0.0014600426102404258, 0.015098060940807928, 0.024048530641768186, 
-0.0012386590124490863, 0.00084437557204774226, -0.0053624428469605679, -0.0071602930098101569, -0.0036063664453677667, -0.021591582007980128, 0.0029803689648313786, 0.0051905326298402445, -0.0090061549353531358, 0.0082970732779835635, -0.0089889735369055521, 0.0038477795212605, -0.0052167134225994255, 0.013478903863842984, -0.0055043788146595263, -0.0061685904678544286, -0.0066423722460146727, 0.00015302192820175906, 0.0029769234256276102, 0.0022653130769389293, 0.0079105584421877029, 8.0279506318141298e-05, 0.0073033837659305005, 0.0037936640567481942, -0.0058206070188344819, -0.010046619303296013, 0.0017471093374766327, 0.017561320521540805, -0.021208327242835886, 0.014333644638912358, -0.0064693173896922286, -0.0030789136219182131, 0.052525470021713122, 0.021884857026661174, 0.0027737858726483047, 0.0090359417144925489, 0.0052910367946880947, 0.0088355701438497872, 0.0043544085196156115, 0.0069199902571643818, -0.0027207475289445565], [-0.0042112720095504154, 0.020962190480557689, 0.0082194129759970112, -0.0014268125649219297, -0.004344523384250956, 0.0094924454396460734, 0.0047083750935003957, 0.00027670428440446494, -0.0089440836654303083, -0.0090987944734081429, 0.0018513772554416652, 0.028273746629177299, 0.0023779281251868054, -0.0062296722666043847, -0.0085313780311072132, -0.0016545405254824481, -0.0058218981492385059, 0.0046241950845224439, 0.0041552759059361507, -0.00054828337850201803, -0.0045551229627827901, -0.0081722321955866421, -0.00854946548166843, 0.008751826213773484, 0.0018408717624664229, -0.0088920062953825232, 0.07906941671999583, -0.0011522942112940616, 0.0022941482735665433, 0.009399055035279498, -0.0023376183240288776, -0.0032074811530607909, -0.0071047299300351632, 0.01944013792475649, -0.0053471544470128614, 0.0018023280434369519, -0.0023979830208241014, 0.051545697844733063, -0.0015038968258339158, -0.0008623064155058775, -0.0010626848077561859, 0.008560792079496761, 0.0022219119173328074, -0.0081971187196068392, -0.0058424591442238849, 0.00066339016577460436, -0.0019308599849549438, 0.061596171542990123, -0.0018813873548246653, 0.0090658713789474026], [-0.017845640815911449, -0.0031020998236286458, 0.0074217967909285974, 0.0013815752082004041, 0.004554731374488967, -0.01032902061134779, 0.0029333064592933439, -0.0018943445250411608, -0.012346057208533035, 0.0040005905863377024, -0.0071149718237795536, -0.0056675424774434392, -0.0038179001426877179, -0.0087527770065764544, 0.0016277008144263154, -0.011441660383957795, -0.004207129077700063, 0.0031956332922284613, 0.0075273802096894032, -0.0058396240910732373, 0.0039726482495901156, 0.0072165595320878746, -0.0036138256007556984, 0.0024203640999713397, -0.0034887990135648726, -0.0025637511853339257, -0.0051918327709520786, -0.0019197293469865899, -0.010589441659839027, 0.02500362712556695, -0.0023196604677095055, -0.0020271855974564185, 0.0010563758249445066, 0.00033685524899888249, -0.0052017760590021885, -0.0023831774829495105, -0.00053559730589721583, -0.0041271314824054993, -0.0063736038491123172, -0.010937732581657691, 0.024243625733129215, -0.0070185398944262213, 0.0032185303999392365, 8.3203170942328092e-05, -0.0054188180326979751, 0.0012008628335259005, -0.0048832318789797996, 0.03092249831082082, -0.0037854325566301708, -0.0034375517588172075], [0.00015813515885737905, -0.0011657729106589493, -0.0033258425977705963, 0.0093934720462424577, 0.04532092713176402, -4.8400482391087285e-05, 0.0055160873885184494, 0.0027685958180729041, -0.0092795756891307021, -0.0029449826597074675, 0.021714147994139454, 
0.003045933521284694, -0.007468941900957266, 0.01498902065157781, -0.0089933393108274483, -0.0037820678942625773, 0.01736522546247465, -0.0025581337564028828, -0.0060658737973498271, -0.0072219553960532964, 0.0394676196235344, -0.0031088576063752135, 0.0045836168511889956, -0.0089691832493192254, -0.0014407189110933116, -0.015159438994250398, 0.016402853375735934, -0.0098402215481742505, 0.0065192793324558426, 0.0037161046354960755, -0.0050913630481300648, 0.014809808503917271, -0.0039102388101285241, -0.005080855427060515, -0.0049834039055323756, 0.014910440909068373, -0.00059868023197447802, 0.0032719072552370942, 0.010183852139492638, -0.0014403590397732864, -0.010937732581657691, 0.002570934732080235, 0.012689151605211675, 0.0066254286175235471, -0.0026220100540470187, -0.003590299609605033, -0.0010724707894585433, -0.0059423737247605657, -0.0012804402014583382, -0.0019350176014494075], [-0.0028014086225296806, 0.035482090202056657, 0.027856121186101635, 0.005974594875636864, -0.0013263464474456029, 0.0020602344846144473, 0.027423998401378422, 0.042781972805756657, -0.011644335566401787, -0.0021258793391432346, 0.0010452484601072153, 0.028302345251363388, 0.0058991575936129404, -0.0029211774171432688, 0.0018606874621000789, 0.00042462392202935689, -0.0046787216688437957, 0.0011657955259697095, 0.0012008628335259005, 0.005974594875636864, -0.0092852834952489305, -0.0015180298606563363, -0.0054108166068269651, 0.020741808957365951, 0.0052551477092289056, -0.0077721593921502461, -0.013007736290457726, 0.0096358087564050003, 0.0036732647690840181, -0.0040976931857197871, 0.001479998347060363, -0.0070615369329555854, 0.0050205856267242438, 0.0096650532832685109, -0.0049468678995842306, 0.0077092618353251574, 0.00064727155579378116, -0.0016234058822858059, -0.0046065190435684933, 0.0080884437426522945, 0.00071201967042957811, 0.0032932866844910705, 0.016524045060611937, 0.049485131558357411, -0.0086049260357222157, -0.0029833185113560348, 0.00037660084371365622, -0.0082577743215483303, -0.00039780810888356019, 0.0001964981046567112]]

In [155]:
out_1 = [['degree', 'experience?', 'way', 'buy', 'When', 'BITS', 'potential', 'scene', 'Larry', 'sleep', 'With', 'learning?', 'regular', 'importance', 'outside', 'Web', "don't", 'legal', 'like?', 'called', 'university', 'progress', 'list', 'development', 'store', 'help', 'bringing', "you've", 'sort', 'apply', 'according', 'were', 'likely', 'island', 'heavy', 'jobs', 'given', 'now', 'program?', 'sell', 'share', 'clear', 'money?', 'encryption', 'character', 'years', 'between', 'build', 'differences', 'points'], ['matter', 'times', 'planning', 'decide', 'shown', 'women?', 'suddenly', 'raised', 'technical', 'random', 'considering', 'classes', 'than', 'song', 'Korean', 'android', 'changing', 'popular', 'was', 'only', 'asked', 'IIT', 'case', 'have', 'entrepreneur?', 'famous', 'reliable', "someone's", 'kinds', 'law', 'deal', 'significant', 'evidence', 'work?', 'things', 'research', 'all', 'Cpp', 'since', 'advantages', 'physics?', 'custom', 'brand', 'Germany?', 'object', 'process', 'ideas', 'Internet', 'business?', 'society'], ['hard', 'killing', 'career', 'about', 'how', 'getting', 'consider', 'machine?', 'important', 'sin', 'single', 'look', 'his', 'school', 'average', 'who', 'finding', 'least', 'light', 'visiting', 'Japanese', 'why?', 'space', 'general', 'run', 'the', 'college', 'country', 'added', 'account', 'Marc', 'photo', 'names', 'community', "I'm", 'company', 'society?', 'watch', 'following', 'happened', 'really', 'online', 'mistakes', 'around', 'seen', 'live', 'starting', 'Has', 'James', 'program'], ['basic', 'parents', 'come', 'Bangalore?', 'both', 'Top', 'microsoft', 'greatest', 'leading', 'return', 'books', 'job', 'paid', 'design', 'option', 'behind', 'car', 'information', 'vs.', 'economy?', 'Japan', 'little', 'mind', 'past?', 'still', 'been', 'sports', 'U.S.', 'done', 'North', 'player', 'possible?', 'sending', 'creative', 'answer?', 'Where', 'needs', 'idea', 'continue', 'languages', 'trust', 'during', 'create', 'quality', 'Indian', 'analysis', 'Which', 'prepare', 'possible', 'believe'], ['pictures', 'you?', 'MBA', 'own', 'economic', 'programming', 'would', 'people?', 'creating', 'kill', 'short', 'contact', 'any', 'escape', 'inspired', 'JavaScript', 'coding', 'sources', 'death?', 'win', 'campus?', 'strong', 'animal', 'global', 'animals', 'does', 'startup?', 'interesting', 'most', 'understand', 'for?', 'Why?', "can't", 'Facebook', 'employees', 'why', 'shows', 'else', 'moved', '2013', 'songs', 'summer', 'oil', 'win?', 'Wales', 'college?', 'like', 'photography', 'point', 'app'], ["isn't", 'digital', "What's", 'conference', 'cool', 'day?', 'last', 'Jimmy', 'Will', 'value', 'staying', 'its', 'picture', 'path', 'small', 'model?', 'you', 'University', 'guy', 'back', 'same', 'industry?', 'software', 'good', 'think', 'never', 'over', 'able', 'beginning', 'affect', 'series', 'steps', 'education', 'being', 'nuclear', 'exist?', 'Google', 'fast', 'knowledge', 'working', 'them', 'future', 'active', 'launched', 'near', 'write', 'algorithms?', 'US?', 'movies', 'math'], ['numbers', 'mathematics', 'Java?', 'science', 'know', 'well', 'left', 'movies?', 'fall', 'position', 'instead', 'energy', 'New', 'send', 'that', 'with?', 'most?', 'educational', 'serve', 'view', 'her', 'anyone', 'Who', 'placement', 'Islam', 'required', 'name', 'history', 'service', 'video', 'them?', 'rate', 'already', 'post', 'dollar', 'worst', 'analyst', 'The', 'others?', 'read', 'girls', 'need', 'could', 'while', 'through', 'adding', 'question', 'take', 'based', 'government'], ['can', 'decision?', 'businesses', 'correct', 'using', 
'dark', 'social', 'considered', 'areas', 'enjoy', 'down?', 'Python?', "Isn't", 'cause', 'happen', 'And', 'better', 'War', 'IITians', 'explain', 'inspiring', 'stupid', 'kept', 'declared', 'period', 'finance', 'Americans', 'day', 'power?', 'price', 'How', 'having', 'website', 'indian', 'startups', 'relative', 'hiring', 'company?', 'compare', 'support', 'among', 'scientific', 'Michael', 'corporate', 'employee', 'girlfriend?', 'virtual', 'advice', 'details', 'reduce'], ['ways', 'thing', 'she', 'spend', 'dangerous', 'City?', 'hacks', 'before', 'spread', 'end', 'him', 'physical', 'real', 'life?', 'effect', 'playing', 'because', 'high', 'hire', 'consumer', 'terms', 'safe', 'number', 'jokes', 'Kickstarter', 'daily', 'players', 'per', 'startups?', 'screen', 'across?', 'saved', 'pros', 'cross', 'hurt', 'computer', 'and/or', 'language', 'read?', 'success', 'bad', 'background', 'very', 'form', 'now?', 'year', 'search', 'Best', 'film', 'said'], ['police', 'for', 'model', 'engineer', 'Delhi', 'hate', 'secret', 'direct', 'prevent', 'student', 'medical', 'has', 'estate', 'against', 'standard', 'just', 'word', 'ability', 'actions', 'besides', 'Was', 'problems', 'study', 'coffee', 'embedded', 'group', 'marriage?', 'not', 'favorite', 'apps', 'professor', 'much', 'are', 'Did', 'electricity', 'private', 'future?', 'turn', 'Does', 'avoid', 'season', 'names?', 'current', 'hotel', 'once', 'products', 'get', 'tips', 'site?', 'Apple'], ['services', 'house', 'up?', 'history?', 'english', 'cons', 'Microsoft', 'learning', 'from', 'significance', 'wants', 'normal', 'water', 'year?', 'What', "one's", 'personal', 'great', 'kind', 'launch', 'predict', 'natural', 'USA', 'programming?', 'political', 'e-commerce', 'feel', 'recommendation', 'ever?', 'should', 'train', 'campus', 'family', 'Amazon', 'enough', 'longer', 'improve', 'play', 'highest', 'chance', 'actually', 'art', 'love', 'modern', 'teach', 'desktop', 'machine', 'email', 'often', 'found'], ['out', 'plan', 'people', 'scene?', 'control', 'rank', 'technology', 'child', 'learn', 'time', 'percentage', 'increase', 'yet', 'etc.?', 'full', 'something', 'data', 'and', 'laptop', 'known', 'months?', 'even', 'Kim', 'receive', 'anything', 'connections', 'type', 'credit', 'had', 'choose?', 'where', 'USA?', 'amount', 'late', 'biggest', 'Indians', 'taken', 'marketing', 'foreign', 'Computer', 'Windows', 'types', 'area', 'successful', 'amazing', 'other', 'minimum', 'capital', 'man', 'there'], ['theory', 'Valley?', 'become', 'India?', 'new', 'state', 'detect', 'did', 'stories', 'application', 'increasing', 'you,', 'country?', 'worth', "doesn't", 'two', 'same?', 'Wikipedia', 'content', 'explanation', 'old', 'long', 'under', 'effective', 'alcohol', 'add', 'either', 'pay', 'but', 'each', 'Have', 'violent', 'early', 'next', 'project', 'financial', 'students', 'companies', 'show', 'memory', 'investor', 'beautiful', 'start', 'events', 'United', 'see', 'soon', 'tech', 'say', 'start?'], ['one', 'posts', 'answer', 'everyone', 'music', 'animated', 'might', 'smart', 'like,', 'laws', 'mean', 'consecutive', 'options', 'algorithm', 'lot', 'courses', 'made', 'person', 'business', 'relationship', 'words', 'convince', 'those', 'within', 'size', 'age', 'goes', 'sign', 'engineering?', 'make', 'necessary', 'features', 'physics', 'movie', 'needed', 'tell', 'women', 'life', 'mobile', 'widely', 'major', 'place', 'websites', 'engine', 'profile', 'camera', 'decision', 'solution', 'Jobs', 'master'], ['free', 'far', 'money', 'PhD', 'travel', 'low', 'rather', 'iPhone', 'web', 'without', 'changes', 'their', 
'this?', 'recommend', 'close', 'international', 'military', 'result', 'they', 'scientists', 'such', 'Silicon', 'prove', 'learned', 'intelligence', 'or', 'girlfriend', 'Steve', 'off', 'find', 'yourself', 'allow', 'more?', 'more', 'quantum', 'join', 'media', 'eat', 'level', 'keep', 'world', 'food', 'alternative', 'human', 'places', 'used', 'survive', 'Masters', 'certain', 'Python'], ['available', 'opinion', 'consulting', 'grow', 'going', 'scope', 'into', 'sex', 'Glass?', 'powerful', 'seek', 'members', 'few', 'role', 'men', 'writing', 'challenging', 'put', 'industry', 'Are', 'Would', 'after', 'what', 'stock', 'page?', 'blogs', 'cost', 'Your', 'change', 'access', 'resources', 'engineering', 'makes', 'develop', 'coming', 'triangle', 'craziest', 'age?', 'of?', 'public', 'criteria', 'sound', 'iOS', 'crash', 'stop', 'Quora,', 'America', 'salary', 'lines', 'Area'], ['along', 'me?', 'recent', 'opposite', 'field', 'Ubuntu', 'Google?', 'language?', 'source', 'this', 'attacks', 'world?', 'programmers', 'these', 'always', 'buying', 'questions', 'wanted', 'sense', 'gets', 'course', 'move', 'making', 'years?', 'humans', 'website?', 'mind-blowing', 'every', 'answers', 'power', 'leave', 'photos', 'San', 'heard', 'methods', 'brain', 'someone', 'India', 'market?', 'time?', 'originated', 'God', 'book', 'difference', 'another', 'main', 'B2B', 'universe?', 'universities', "aren't"], ['use', 'too', 'got', 'reality', 'plausible', 'account?', 'myself', 'attention', 'part', 'despite', 'looking', 'English', 'lead', 'less', 'our', 'down', 'google', 'telling', 'it?', 'phone?', 'anything,', 'started', 'Chinese', 'big', 'doing', 'pursue', 'cloud', 'President', 'city', 'right', 'product?', 'call', 'which', 'Java', 'professors', 'population', 'American', 'currently', 'game', 'Quora', 'Why', 'Bay', 'large', 'nothing', 'phones?', 'system?', 'download', 'cases', 'countries', 'NBA'], ['investment', 'national', 'civil', 'invest', 'boyfriend', 'give', 'lives', 'growth', 'tools', 'videos', 'efforts', 'wrong', 'different', 'cannot', 'home', 'reading', 'dollars', 'startup', 'Twitter', 'will', 'true', 'shoot', 'favourite', 'be?', 'product', 'Can', 'projects', 'then', 'living', 'do?', 'apart', 'know?', '2012?', 'difficult', 'link', 'out?', 'soccer', 'with', 'earth?', 'attractive', 'Quora?', 'want', 'simple', 'wearing', 'choose', 'similar', 'common', 'provide', 'stay', 'except'], ['restaurants', 'some', 'team', 'meaning?', 'reasons', 'happens', 'top', 'girl', 'best', 'World', 'many', 'sexual', 'Bill', 'regarding', 'morally', 'running', 'married', 'marriage', 'system', 'meaning', 'ask', 'cold', 'moment', 'key', 'examples', 'when', 'work', 'young', 'vacation', 'maximum', 'century?', 'sites', 'experience', 'school?', 'platform', 'text', 'mothers', 'night', 'order', 'ever', "didn't", 'days?', 'story', 'facts', 'Should', 'first', 'ten', 'facebook', 'IITs', 'your']]


In [156]:
print('\n')
print('Min')
for i in range(len(corr_big_list_1)):
    print(i, min(corr_big_list_1[i]))
print('\n')
print('Max')
for i in range(len(corr_big_list_1)):
    print( i, max(corr_big_list_1[i]))



Min
0 -0.01818425424565255
1 -0.014343373247904022
2 -0.011395546776726858
3 -0.013467420834281653
4 -0.02665373652944691
5 -0.014099966557010814
6 -0.011227820874658572
7 -0.014662151053035876
8 -0.015038653093675068
9 -0.029050900883528373
10 -0.010238308303535339
11 -0.014751886177321712
12 -0.011570294939930082
13 -0.015750555377159924
14 -0.01426558554787689
15 -0.02159158200798013
16 -0.009098794473408143
17 -0.01784564081591145
18 -0.015159438994250398
19 -0.013007736290457726


Max
0 0.1023288271726879
1 0.042502038772258076
2 0.06473026552187619
3 0.05740501774971772
4 0.03626130275474788
5 0.025112717036490935
6 0.03275350989916622
7 0.03970866645777648
8 0.045281035540934676
9 0.022621877650546317
10 0.04865064388020746
11 0.012690952660123449
12 0.0512523800623211
13 0.0832672580865723
14 0.0632965085195261
15 0.05252547002171312
16 0.07906941671999583
17 0.03092249831082082
18 0.04532092713176402
19 0.04948513155835741

In [157]:
corr_big_list_1[0].index(0.1023288271726879)


Out[157]:
20

In [158]:
out_1[0][20]


Out[158]:
'university'

OK, university does really well compared to the others, with a correlation of 0.10. What if we look at the combination of the top 3 words? From the per-chunk maxima above, the next two highest are in chunks 13 (0.083) and 16 (0.079), so let's look up those words.


In [162]:
corr_big_list_1[13].index(max(corr_big_list_1[13]))


Out[162]:
23

In [161]:
out_1[13][23]


Out[161]:
'within'

In [160]:
corr_big_list_1[16].index(max(corr_big_list_1[16]))


Out[160]:
26

In [159]:
out_1[16][26]


Out[159]:
'mind-blowing'

In [39]:
# gives the correlation of `__ans__` with (a) a 0/1 flag marking question texts that ...
# ... contain any of the given words, and (b) that flag multiplied by `num_answers`

def CorrOR(str_):
    split = str_.split(', ')     # the words come in as a single comma-separated string
    joined = '|'.join(split)     # build an OR regex pattern
    # 0/1 series: does the question text contain any of the words?
    combined_df = data_df.question_text.apply(lambda x: 1 if any(pd.Series(x).str.contains(str(joined))) else 0)
    prod_df = combined_df * data_df['num_answers']
    return combined_df.corr(data_df.__ans__), prod_df.corr(data_df.__ans__)

In [40]:
CorrOR('university, within, mind-blowing')


Out[40]:
(0.095112383288031968, 0.35205210234998646)

Let's do even more combinations.


In [53]:
# First, pick the word with the highest correlation coefficient in each chunk

In [163]:
more_words = []
for i in range(20):
    idx_1 = i
    idx_2 = corr_big_list_1[idx_1].index(max(corr_big_list_1[idx_1]))
    more_words.append(out_1[idx_1][idx_2])

In [164]:
more_words


Out[164]:
['university',
 'physics?',
 'single',
 'mind',
 'interesting',
 'guy',
 'instead',
 'indian',
 'across?',
 'not',
 'actually',
 'technology',
 'India?',
 'within',
 'intelligence',
 'sound',
 'mind-blowing',
 'cases',
 'boyfriend',
 'facts']

In [48]:
more_words_str = ', '.join(more_words)
more_words_str


Out[48]:
'university, physics?, single, mind, interesting, guy, instead, indian, across?, not, actually, technology, India?, within, intelligence, sound, mind-blowing, cases, boyfriend, facts'

In [49]:
CorrOR(more_words_str)


Out[49]:
(0.082244785621280792, 0.27371488556060108)

In [52]:
CorrOR('university, physics?, single, mind, interesting, guy, instead, indian, across?, not, actually, technology, India, within, intelligence, sound, mind-blowing, cases, boyfriend, facts')


Out[52]:
(0.076683177038129921, 0.25123802721996741)

Notice that the second-highest correlation coefficient could sit in the same list as the highest one; if that is the case, we missed it when we picked the top 3 above. To avoid this, let's pick the top 3 in each list in corr_big_list_1.


In [151]:
top_3_list = []
for i in range(0, 20):
    list_ = corr_big_list_1[i]
    a_1 = [x for x in list_ if not x == max(list_)]   # drop the largest value
    a_2 = [x for x in a_1 if not x == max(a_1)]       # drop the second largest
    l_nn = [max(list_), max(a_1), max(a_2)]           # the chunk's top 3 correlations
    top_3_list.append(l_nn)

In [152]:
top_3_list


Out[152]:
[[0.1023288271726879, 0.035198130429207504, 0.034104606674625594],
 [0.042502038772258076, 0.02426591963664568, 0.02166493696952319],
 [0.06473026552187619, 0.045228364686901064, 0.026283788933778612],
 [0.05740501774971772, 0.05214609286433304, 0.02510966871856704],
 [0.03626130275474788, 0.02945685709070631, 0.028008637897122796],
 [0.025112717036490935, 0.024330266219235487, 0.021650811452954503],
 [0.03275350989916622, 0.028385483975969798, 0.02238758978754662],
 [0.03970866645777648, 0.03596378835673901, 0.018226979955979515],
 [0.045281035540934676, 0.030264366126329077, 0.02940146853112017],
 [0.022621877650546317, 0.018436905510209354, 0.012626858271881676],
 [0.04865064388020746, 0.026537827187008613, 0.021838105204411987],
 [0.012690952660123449, 0.010249949199206944, 0.008571568723658446],
 [0.0512523800623211, 0.027566755054238782, 0.021720509764465865],
 [0.0832672580865723, 0.0281462823696186, 0.023671991914385355],
 [0.0632965085195261, 0.03596378835673901, 0.028911686398659542],
 [0.05252547002171312, 0.024048530641768186, 0.021884857026661174],
 [0.07906941671999583, 0.06159617154299012, 0.05154569784473306],
 [0.03092249831082082, 0.02500362712556695, 0.024243625733129215],
 [0.04532092713176402, 0.0394676196235344, 0.021714147994139454],
 [0.04948513155835741, 0.04278197280575666, 0.03548209020205666]]

In [184]:
more_top_words = []
for idx_1 in range(20):
    for idx_2 in range(3):
        idx = corr_big_list_1[idx_1].index(top_3_list[idx_1][idx_2])
        more_top_words.append(out_1[idx_1][idx])

In [185]:
len(more_top_words)


Out[185]:
60

In [186]:
more_top_words_str = ', '.join(more_top_words)

In [187]:
more_top_words_str


Out[187]:
'university, list, With, physics?, shown, have, single, finding, around, mind, Indian, come, interesting, most, inspired, guy, movies, value, instead, most?, movies?, indian, girlfriend?, advice, across?, physical, cross, not, hate, Apple, actually, found, modern, technology, there, biggest, India?, each, but, within, physics, tell, intelligence, girlfriend, changes, sound, Glass?, iOS, mind-blowing, universe?, India, cases, right, Why, boyfriend, true, efforts, facts, girl, some'

In [188]:
CorrOR(more_top_words_str)


Out[188]:
(0.061155466257332765, 0.34819022465257771)

This is an improvement over the previous combination! What if we use only the top 2?



In [190]:
more_top_words_ = [] #ran out of ideas for list names!
for idx_1 in range(20):
    for idx_2 in range(2):
        idx = corr_big_list_1[idx_1].index(top_3_list[idx_1][idx_2])
        more_top_words_.append(out_1[idx_1][idx])

In [193]:
len(more_top_words_)


Out[193]:
40

In [195]:
more_top_words_str_ = ', '.join(more_top_words_)

In [196]:
more_top_words_str_


Out[196]:
'university, list, physics?, shown, single, finding, mind, Indian, interesting, most, guy, movies, instead, most?, indian, girlfriend?, across?, physical, not, hate, actually, found, technology, there, India?, each, within, physics, intelligence, girlfriend, sound, Glass?, mind-blowing, universe?, cases, right, boyfriend, true, facts, girl'

In [197]:
CorrOR(more_top_words_str_)


Out[197]:
(0.066956432145978059, 0.33444324511112683)

OK, so not as good as the top-3 word combination. I will create a feature for the top 3 (along with the other features explored in this and the previous notebook) in the next notebook.
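
As a rough sketch of what such a feature could look like (my own illustration using the top-3 words found above, with a hypothetical column name, not the author's next notebook):


In [ ]:
# hypothetical 0/1 feature: does the question text mention any of the top-3 words?
top3_pattern = '|'.join(['university', 'within', 'mind-blowing'])
data_df['has_top3_word'] = data_df.question_text.str.contains(top3_pattern).astype(int)
data_df['has_top3_word'].corr(data_df.__ans__)  # should match the first value CorrOR returned above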

Number of Followers


In [202]:
data_df['topics']


Out[202]:
0               [{'followers': 500022, 'name': 'Movies'}]
1       [{'followers': 179, 'name': 'International Mat...
2              [{'followers': 614223, 'name': 'Science'}]
3       [{'followers': 614223, 'name': 'Science'}, {'f...
4        [{'followers': 1536, 'name': 'Android Tablets'}]
5       [{'followers': 91, 'name': 'Smartphone Applica...
6       [{'followers': 526597, 'name': 'Business'}, {'...
7       [{'followers': 289, 'name': 'Needs to Be Clear...
8       [{'followers': 59, 'name': 'Pacifism'}, {'foll...
9       [{'followers': 224, 'name': 'Boston Marathon T...
10      [{'followers': 321001, 'name': 'Entrepreneursh...
11      [{'followers': 199, 'name': 'Technology Indust...
12       [{'followers': 1240, 'name': 'U.S. Presidents'}]
13      [{'followers': 26, 'name': 'Deciding Whether t...
14      [{'followers': 3693, 'name': 'Medical Research...
15      [{'followers': 2049, 'name': 'Tablet Devices a...
16      [{'followers': 999, 'name': 'Luxury'}, {'follo...
17      [{'followers': 39744, 'name': 'Web Development'}]
18      [{'followers': 11842, 'name': 'Internet Advert...
19      [{'followers': 268430, 'name': 'Television Ser...
20      [{'followers': 167176, 'name': 'Business Model...
21      [{'followers': 490, 'name': 'Online Reputation...
22      [{'followers': 67265, 'name': 'Sex'}, {'follow...
23      [{'followers': 2525, 'name': 'Theology'}, {'fo...
24      [{'followers': 4332, 'name': 'Occupy Movement'...
25      [{'followers': 2450, 'name': 'Wanting and Maki...
26      [{'followers': 3567, 'name': 'Private Equity'}...
27            [{'followers': 238905, 'name': 'Facebook'}]
28            [{'followers': 238905, 'name': 'Facebook'}]
29            [{'followers': 986, 'name': 'Glenn Gould'}]
                              ...                        
8970     [{'followers': 3899, 'name': 'Quora (company)'}]
8971    [{'followers': 13, 'name': 'Flame Effects'}, {...
8972    [{'followers': 39744, 'name': 'Web Development'}]
8973    [{'followers': 264984, 'name': 'Dating and Rel...
8974    [{'followers': 1460, 'name': 'Numbers'}, {'fol...
8975          [{'followers': 7928, 'name': 'Recruiting'}]
8976            [{'followers': 4729, 'name': 'Software'}]
8977             [{'followers': 93447, 'name': 'Amazon'}]
8978    [{'followers': 4859, 'name': 'Sales'}, {'follo...
8979      [{'followers': 13108, 'name': 'Career Advice'}]
8980          [{'followers': 3, 'name': 'Done Genetics'}]
8981    [{'followers': 6467, 'name': 'Colleges and Uni...
8982    [{'followers': 289, 'name': 'Needs to Be Clear...
8983          [{'followers': 238905, 'name': 'Facebook'}]
8984       [{'followers': 809, 'name': 'Auto Insurance'}]
8985    [{'followers': 590279, 'name': 'Books'}, {'fol...
8986    [{'followers': 1, 'name': 'User Data'}, {'foll...
8987              [{'followers': 361, 'name': 'Royalty'}]
8988    [{'followers': 24985, 'name': 'Innovation'}, {...
8989    [{'followers': 12423, 'name': 'Japan'}, {'foll...
8990    [{'followers': 11233, 'name': 'Life Advice'}, ...
8991    [{'followers': 1483, 'name': 'Bitly'}, {'follo...
8992    [{'followers': 295, 'name': 'Curiosity (Mars R...
8993             [{'followers': 6423, 'name': 'Germany'}]
8994    [{'followers': 854, 'name': 'Syria'}, {'follow...
8995    [{'followers': 10123, 'name': 'Animals'}, {'fo...
8996    [{'followers': 7149, 'name': 'iPad Application...
8997    [{'followers': 18741, 'name': 'The Hobbit (193...
8998         [{'followers': 7260, 'name': 'Real Estate'}]
8999           [{'followers': 199773, 'name': 'Physics'}]
Name: topics, Length: 9000, dtype: object

In [266]:
def funn(x): # x is the list of topic dicts in one row's `topics` cell
    return sum(topic['followers'] for topic in x)
data_df['topics_followers'] = data_df['topics'].apply(funn)
data_df.drop(['topics'], axis=1, inplace=True)

We take the sum of the followers of every topic a question appears under because, naturally, the chance of a question being viewed increases with the number of topics it appears under and the audience those topics reach.


In [268]:
data_df['topics_followers'].corr(data_df.__ans__)


Out[268]:
0.12134242287111571

In [269]:
temp = data_df['topics_followers'] * data_df['anonymous']
temp.corr(data_df.__ans__)


Out[269]:
0.056418286033386875

In [270]:
temp = data_df['topics_followers'] * data_df['context_topic'].apply(lambda x: 0 if x is None else 1)
temp.corr(data_df.__ans__)


Out[270]:
0.11789574948786086

In [274]:
temp = data_df['topics_followers'] * data_df['num_answers'].apply(lambda x: 1 if x>=29 else 0)
temp.corr(data_df.__ans__)


Out[274]:
0.26310761648405595

In [276]:
temp_0 = data_df.question_text.apply(lambda x: len(x)) #len of characters
temp = data_df['topics_followers'] * temp_0.apply(lambda x: 1 if x <= 182 else 0)
temp.corr(data_df.__ans__)


Out[276]:
0.12145956730654207