In [3]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [44]:
import pandas as pd

In [49]:
TOPIC_FILE = r'topics\2.txt'

def read_topic(path):
    df = pd.read_csv(path, header=None, sep=';')
    df.columns = ['term', 'score']
    return df

df = read_topic(TOPIC_FILE)

In [18]:
df.score.plot()


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f30cb9bfd0>

In [ ]:
# To get to the elbow, draw a line between the top right and bottom left corner and 
# get the point that is farthest away from that curve.

In [19]:
from numpy import cross, subtract
from numpy.linalg import norm

def distance(p, a, b):
    return norm(cross(subtract(b, a), subtract(a, p)))  / norm (subtract(b, a))

In [87]:
def map_distance(series):
    # Line:
    bottom_left = (0, series.min())
    top_right = (len(series) - 1, series.max())
    return pd.Series(distance(p, bottom_left, top_right) for p in enumerate(series))
    
df['distance'] = map_distance(df.score)

In [122]:
elbow_score = df.score[df.distance == df.distance.max()]
#elbow_score
df


Out[122]:
term score d distance
0 management 5.497831 0.000000 0.000000
1 advantage 5.522715 0.753326 0.753326
2 computing 5.861587 1.306022 1.306022
3 usa 5.921815 2.036765 2.036765
4 designed 5.942556 2.792738 2.792738
5 technical 6.029256 3.506566 3.506566
6 requirements 6.274679 4.118973 4.118973
7 consists 6.439008 4.783198 4.783198
8 supported 6.502008 5.512169 5.512169
9 currently 6.627805 6.201015 6.201015
10 architecture 6.930092 6.777087 6.777087
11 25 6.957091 7.529062 7.529062
12 including 7.016737 8.260176 8.260176
13 better 7.032905 9.019072 9.019072
14 introduction 7.047326 9.779084 9.779084
15 includes 7.061765 10.539085 10.539085
16 approaches 7.082910 11.294801 11.294801
17 19 7.098590 12.054008 12.054008
18 needs 7.132538 12.801543 12.801543
19 small 7.178708 13.541268 13.541268
20 18 7.337569 14.208987 14.208987
21 developed 7.406332 14.934275 14.934275
22 known 7.463581 15.666922 15.666922
23 report 7.477530 16.427235 16.427235
24 high 7.608263 17.112926 17.112926
25 real 7.688692 17.830761 17.830761
26 able 7.722767 18.578215 18.578215
27 future 7.823149 19.283300 19.283300
28 international 7.840851 20.041216 20.041216
29 hand 7.943110 20.745102 20.745102
... ... ... ... ...
202 state 31.348638 138.865761 138.865761
203 model 31.819085 139.334383 139.334383
204 defined 32.213447 139.851622 139.851622
205 time 32.244143 140.601235 140.601235
206 languages 33.423580 140.616830 140.616830
207 interface 33.892472 141.086446 141.086446
208 using 36.230775 140.361553 140.361553
209 software 36.310578 141.079787 141.079787
210 program 37.033587 141.387029 141.387029
211 use 39.115746 140.825806 140.825806
212 set 40.234591 140.880118 140.880118
213 used 43.294364 139.694223 139.694223
214 implementation 43.322922 140.445202 140.445202
215 example 45.788073 139.639257 139.639257
216 programming 48.426526 138.722576 138.722576
217 figure 51.374533 137.608097 137.608097
218 design 52.107396 137.909042 137.909042
219 instance 52.113113 138.674616 138.674616
220 language 52.923953 138.925736 138.925736
221 type 53.677647 139.213370 139.213370
222 new 53.719063 139.956133 139.956133
223 oriented 56.284259 139.086261 139.086261
224 objects 58.511591 138.432277 138.432277
225 code 68.511984 132.811493 132.811493
226 java 78.136335 127.430991 127.430991
227 methods 95.358946 117.195385 117.195385
228 classes 130.463778 95.533470 95.533470
229 object 155.163360 80.520258 80.520258
230 method 156.192224 80.632066 80.632066
231 class 283.585592 0.000000 0.000000

232 rows × 4 columns


In [136]:
import seaborn as sns
sns.regplot(df.score, df.distance)


Out[136]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f310878d68>

In [ ]:
# Identify relevant topics based on their 
# summed scores

In [120]:
import glob
topics = glob.glob(r'topics\*.txt')

frames = [(topic, read_topic(topic)) for topic in topics]
topic_scores = pd.DataFrame(
     (topic, frame.score.sum()) for topic, frame in frames
)
topic_scores.columns = ['topic', 'score']

# Filters all topics that are below the elbow. 
# The elbow marks the spot under which topics become irrelevant. 
topic_scores['distance'] = map_distance(topic_scores.score)
elbow_score = topic_scores.score[topic_scores.distance == topic_scores.distance.max()].iloc[0]
topic_scores[topic_scores.score > elbow_score]


Out[120]:
topic score distance
0 topics\0.txt 63.051016 0.083780
4 topics\12.txt 6708.257388 7.783789
5 topics\13.txt 7966.981343 8.999986
7 topics\2.txt 4828.603411 1.474347
8 topics\3.txt 3532.186073 1.808212
10 topics\5.txt 5099.310076 1.049024
12 topics\7.txt 7668.780121 1.474964
13 topics\8.txt 4650.181029 4.839786
14 topics\9.txt 4310.782819 6.437352