notebook.community

Edit and run



In [3]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [44]:

    
import pandas as pd



In [49]:

    
TOPIC_FILE = r'topics\2.txt'

def read_topic(path):
    df = pd.read_csv(path, header=None, sep=';')
    df.columns = ['term', 'score']
    return df

df = read_topic(TOPIC_FILE)



In [18]:

    
df.score.plot()









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x1f30cb9bfd0>



In [ ]:

    
# To get to the elbow, draw a line between the top right and bottom left corner and 
# get the point that is farthest away from that curve.



In [19]:

    
from numpy import cross, subtract
from numpy.linalg import norm

def distance(p, a, b):
    return norm(cross(subtract(b, a), subtract(a, p)))  / norm (subtract(b, a))



In [87]:

    
def map_distance(series):
    # Line:
    bottom_left = (0, series.min())
    top_right = (len(series) - 1, series.max())
    return pd.Series(distance(p, bottom_left, top_right) for p in enumerate(series))
    
df['distance'] = map_distance(df.score)



In [122]:

    
elbow_score = df.score[df.distance == df.distance.max()]
#elbow_score
df









    Out[122]:







  
    
      
      term
      score
      d
      distance
    
  
  
    
      0
      management
      5.497831
      0.000000
      0.000000
    
    
      1
      advantage
      5.522715
      0.753326
      0.753326
    
    
      2
      computing
      5.861587
      1.306022
      1.306022
    
    
      3
      usa
      5.921815
      2.036765
      2.036765
    
    
      4
      designed
      5.942556
      2.792738
      2.792738
    
    
      5
      technical
      6.029256
      3.506566
      3.506566
    
    
      6
      requirements
      6.274679
      4.118973
      4.118973
    
    
      7
      consists
      6.439008
      4.783198
      4.783198
    
    
      8
      supported
      6.502008
      5.512169
      5.512169
    
    
      9
      currently
      6.627805
      6.201015
      6.201015
    
    
      10
      architecture
      6.930092
      6.777087
      6.777087
    
    
      11
      25
      6.957091
      7.529062
      7.529062
    
    
      12
      including
      7.016737
      8.260176
      8.260176
    
    
      13
      better
      7.032905
      9.019072
      9.019072
    
    
      14
      introduction
      7.047326
      9.779084
      9.779084
    
    
      15
      includes
      7.061765
      10.539085
      10.539085
    
    
      16
      approaches
      7.082910
      11.294801
      11.294801
    
    
      17
      19
      7.098590
      12.054008
      12.054008
    
    
      18
      needs
      7.132538
      12.801543
      12.801543
    
    
      19
      small
      7.178708
      13.541268
      13.541268
    
    
      20
      18
      7.337569
      14.208987
      14.208987
    
    
      21
      developed
      7.406332
      14.934275
      14.934275
    
    
      22
      known
      7.463581
      15.666922
      15.666922
    
    
      23
      report
      7.477530
      16.427235
      16.427235
    
    
      24
      high
      7.608263
      17.112926
      17.112926
    
    
      25
      real
      7.688692
      17.830761
      17.830761
    
    
      26
      able
      7.722767
      18.578215
      18.578215
    
    
      27
      future
      7.823149
      19.283300
      19.283300
    
    
      28
      international
      7.840851
      20.041216
      20.041216
    
    
      29
      hand
      7.943110
      20.745102
      20.745102
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      202
      state
      31.348638
      138.865761
      138.865761
    
    
      203
      model
      31.819085
      139.334383
      139.334383
    
    
      204
      defined
      32.213447
      139.851622
      139.851622
    
    
      205
      time
      32.244143
      140.601235
      140.601235
    
    
      206
      languages
      33.423580
      140.616830
      140.616830
    
    
      207
      interface
      33.892472
      141.086446
      141.086446
    
    
      208
      using
      36.230775
      140.361553
      140.361553
    
    
      209
      software
      36.310578
      141.079787
      141.079787
    
    
      210
      program
      37.033587
      141.387029
      141.387029
    
    
      211
      use
      39.115746
      140.825806
      140.825806
    
    
      212
      set
      40.234591
      140.880118
      140.880118
    
    
      213
      used
      43.294364
      139.694223
      139.694223
    
    
      214
      implementation
      43.322922
      140.445202
      140.445202
    
    
      215
      example
      45.788073
      139.639257
      139.639257
    
    
      216
      programming
      48.426526
      138.722576
      138.722576
    
    
      217
      figure
      51.374533
      137.608097
      137.608097
    
    
      218
      design
      52.107396
      137.909042
      137.909042
    
    
      219
      instance
      52.113113
      138.674616
      138.674616
    
    
      220
      language
      52.923953
      138.925736
      138.925736
    
    
      221
      type
      53.677647
      139.213370
      139.213370
    
    
      222
      new
      53.719063
      139.956133
      139.956133
    
    
      223
      oriented
      56.284259
      139.086261
      139.086261
    
    
      224
      objects
      58.511591
      138.432277
      138.432277
    
    
      225
      code
      68.511984
      132.811493
      132.811493
    
    
      226
      java
      78.136335
      127.430991
      127.430991
    
    
      227
      methods
      95.358946
      117.195385
      117.195385
    
    
      228
      classes
      130.463778
      95.533470
      95.533470
    
    
      229
      object
      155.163360
      80.520258
      80.520258
    
    
      230
      method
      156.192224
      80.632066
      80.632066
    
    
      231
      class
      283.585592
      0.000000
      0.000000
    
  

232 rows × 4 columns



In [136]:

    
import seaborn as sns
sns.regplot(df.score, df.distance)









    Out[136]:





<matplotlib.axes._subplots.AxesSubplot at 0x1f310878d68>



In [ ]:

    
# Identify relevant topics based on their 
# summed scores



In [120]:

    
import glob
topics = glob.glob(r'topics\*.txt')

frames = [(topic, read_topic(topic)) for topic in topics]
topic_scores = pd.DataFrame(
     (topic, frame.score.sum()) for topic, frame in frames
)
topic_scores.columns = ['topic', 'score']

# Filters all topics that are below the elbow. 
# The elbow marks the spot under which topics become irrelevant. 
topic_scores['distance'] = map_distance(topic_scores.score)
elbow_score = topic_scores.score[topic_scores.distance == topic_scores.distance.max()].iloc[0]
topic_scores[topic_scores.score > elbow_score]









    Out[120]:







  
    
      
      topic
      score
      distance
    
  
  
    
      0
      topics\0.txt
      63.051016
      0.083780
    
    
      4
      topics\12.txt
      6708.257388
      7.783789
    
    
      5
      topics\13.txt
      7966.981343
      8.999986
    
    
      7
      topics\2.txt
      4828.603411
      1.474347
    
    
      8
      topics\3.txt
      3532.186073
      1.808212
    
    
      10
      topics\5.txt
      5099.310076
      1.049024
    
    
      12
      topics\7.txt
      7668.780121
      1.474964
    
    
      13
      topics\8.txt
      4650.181029
      4.839786
    
    
      14
      topics\9.txt
      4310.782819
      6.437352

	term	score	d	distance
0	management	5.497831	0.000000	0.000000
1	advantage	5.522715	0.753326	0.753326
2	computing	5.861587	1.306022	1.306022
3	usa	5.921815	2.036765	2.036765
4	designed	5.942556	2.792738	2.792738
5	technical	6.029256	3.506566	3.506566
6	requirements	6.274679	4.118973	4.118973
7	consists	6.439008	4.783198	4.783198
8	supported	6.502008	5.512169	5.512169
9	currently	6.627805	6.201015	6.201015
10	architecture	6.930092	6.777087	6.777087
11	25	6.957091	7.529062	7.529062
12	including	7.016737	8.260176	8.260176
13	better	7.032905	9.019072	9.019072
14	introduction	7.047326	9.779084	9.779084
15	includes	7.061765	10.539085	10.539085
16	approaches	7.082910	11.294801	11.294801
17	19	7.098590	12.054008	12.054008
18	needs	7.132538	12.801543	12.801543
19	small	7.178708	13.541268	13.541268
20	18	7.337569	14.208987	14.208987
21	developed	7.406332	14.934275	14.934275
22	known	7.463581	15.666922	15.666922
23	report	7.477530	16.427235	16.427235
24	high	7.608263	17.112926	17.112926
25	real	7.688692	17.830761	17.830761
26	able	7.722767	18.578215	18.578215
27	future	7.823149	19.283300	19.283300
28	international	7.840851	20.041216	20.041216
29	hand	7.943110	20.745102	20.745102
...	...	...	...	...
202	state	31.348638	138.865761	138.865761
203	model	31.819085	139.334383	139.334383
204	defined	32.213447	139.851622	139.851622
205	time	32.244143	140.601235	140.601235
206	languages	33.423580	140.616830	140.616830
207	interface	33.892472	141.086446	141.086446
208	using	36.230775	140.361553	140.361553
209	software	36.310578	141.079787	141.079787
210	program	37.033587	141.387029	141.387029
211	use	39.115746	140.825806	140.825806
212	set	40.234591	140.880118	140.880118
213	used	43.294364	139.694223	139.694223
214	implementation	43.322922	140.445202	140.445202
215	example	45.788073	139.639257	139.639257
216	programming	48.426526	138.722576	138.722576
217	figure	51.374533	137.608097	137.608097
218	design	52.107396	137.909042	137.909042
219	instance	52.113113	138.674616	138.674616
220	language	52.923953	138.925736	138.925736
221	type	53.677647	139.213370	139.213370
222	new	53.719063	139.956133	139.956133
223	oriented	56.284259	139.086261	139.086261
224	objects	58.511591	138.432277	138.432277
225	code	68.511984	132.811493	132.811493
226	java	78.136335	127.430991	127.430991
227	methods	95.358946	117.195385	117.195385
228	classes	130.463778	95.533470	95.533470
229	object	155.163360	80.520258	80.520258
230	method	156.192224	80.632066	80.632066
231	class	283.585592	0.000000	0.000000

	topic	score	distance
0	topics\0.txt	63.051016	0.083780
4	topics\12.txt	6708.257388	7.783789
5	topics\13.txt	7966.981343	8.999986
7	topics\2.txt	4828.603411	1.474347
8	topics\3.txt	3532.186073	1.808212
10	topics\5.txt	5099.310076	1.049024
12	topics\7.txt	7668.780121	1.474964
13	topics\8.txt	4650.181029	4.839786
14	topics\9.txt	4310.782819	6.437352