In [3]:
%pylab inline
In [44]:
import pandas as pd
In [49]:
# Path of the single topic file inspected in the first half of the notebook.
# A forward slash is used because it resolves on Windows as well as on POSIX
# systems, whereas a backslash is only a path separator on Windows.
TOPIC_FILE = 'topics/2.txt'
def read_topic(path):
    """Load a topic file into a DataFrame with columns ``term`` and ``score``.

    The file is parsed as header-less, semicolon-separated text; its two
    columns are then labelled ``term`` and ``score``.

    Parameters
    ----------
    path : str
        Location of the topic file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.
    """
    column_names = ['term', 'score']
    table = pd.read_csv(path, sep=';', header=None)
    table.columns = column_names
    return table
# Load the topic file selected above; `df` is the working frame for the cells below.
df = read_topic(path=TOPIC_FILE)
In [18]:
# Line plot of the scores in file order (row position on the x axis).
df['score'].plot()
Out[18]:
In [ ]:
# To find the elbow, draw a straight line between the bottom-left and top-right corners
# of the plot and take the point on the curve that is farthest away from that line.
In [19]:
from numpy import cross, subtract
from numpy.linalg import norm
def distance(p, a, b):
    """Return the perpendicular distance from point ``p`` to the line through ``a`` and ``b``.

    Computed with the cross-product formula ``|(b - a) x (a - p)| / |b - a|``.

    Parameters
    ----------
    p, a, b : sequence of 2 or 3 numbers
        Coordinates of the query point (``p``) and of two points on the line.

    Returns
    -------
    float
        Distance from ``p`` to the line. When ``a`` and ``b`` coincide no line
        is defined, so the distance from ``p`` to ``a`` is returned instead of
        the NaN that a 0/0 division would produce.
    """
    def _as_3d(point):
        # numpy.cross deprecates 2-element vectors (NumPy 2.0); padding with
        # z = 0 gives the same magnitude without the deprecation warning.
        coords = tuple(point)
        return coords + (0,) if len(coords) == 2 else coords

    p, a, b = _as_3d(p), _as_3d(a), _as_3d(b)
    direction = subtract(b, a)
    length = norm(direction)
    if length == 0:
        # Degenerate line (a == b): fall back to the point-to-point distance.
        return norm(subtract(a, p))
    return norm(cross(direction, subtract(a, p))) / length
In [87]:
def map_distance(series):
    """Return each value's distance from the bottom-left/top-right reference line.

    Every value is treated as the 2-D point ``(position, value)``, where
    ``position`` is its 0-based offset within ``series``. The reference line
    runs from ``(0, series.min())`` to ``(len(series) - 1, series.max())``.

    Parameters
    ----------
    series : pandas.Series (or array-like offering ``min``/``max``)
        The values to measure.

    Returns
    -------
    pandas.Series
        Distances in the same order as ``series``. The result carries the
        index of ``series`` when it has one, so assigning it as a DataFrame
        column lines up row by row even if the index is not 0..n-1.
    """
    # Reference line: bottom-left corner to top-right corner of the plot.
    bottom_left = (0, series.min())
    top_right = (len(series) - 1, series.max())
    distances = [
        distance(point, bottom_left, top_right) for point in enumerate(series)
    ]
    # Inputs without an index (e.g. a numpy array) fall back to the default one.
    return pd.Series(distances, index=getattr(series, 'index', None), dtype=float)
# Store each row's distance from the reference line next to its score.
df['distance'] = map_distance(df['score'])
In [122]:
# Score(s) of the row(s) whose distance from the reference line is the largest.
# The result is a Series: every row tied for the maximum distance is kept.
elbow_score = df.loc[df['distance'] == df['distance'].max(), 'score']
# Display the whole frame, including the new `distance` column.
df
Out[122]:
In [136]:
import seaborn as sns
# Scatter score against distance, with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments, while older releases accept the keyword form as well.
sns.regplot(x=df.score, y=df.distance)
Out[136]:
In [ ]:
# Identify relevant topics based on their
# summed scores
In [120]:
import glob
# glob returns matches in arbitrary order; sorting the paths makes the row
# order, and with it the position-based distances below, reproducible.
# A forward slash is used because it works on Windows as well as on POSIX.
topics = sorted(glob.glob('topics/*.txt'))
if not topics:
    # Fail with a clear message instead of a column-length mismatch further down.
    raise FileNotFoundError("no topic files match 'topics/*.txt'")
frames = [(topic, read_topic(topic)) for topic in topics]

# One row per topic file: its path and the sum of its term scores.
topic_scores = pd.DataFrame(
    [(topic, frame.score.sum()) for topic, frame in frames],
    columns=['topic', 'score'],
)

# Keep only the topics whose summed score lies strictly above the elbow.
# The elbow marks the score at or below which topics are treated as irrelevant.
# NOTE(review): map_distance uses the row position as the x coordinate, so the
# elbow depends on row order; rows are in file-name order here, not sorted by
# score -- confirm this is the intended ordering.
topic_scores['distance'] = map_distance(topic_scores.score)
is_elbow = topic_scores.distance == topic_scores.distance.max()
elbow_score = topic_scores.score[is_elbow].iloc[0]
topic_scores[topic_scores.score > elbow_score]
Out[120]: