In [47]:
%matplotlib inline
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
# Global plot styling: seaborn's "white" theme; color_codes=True remaps
# matplotlib's one-letter color shorthands to the seaborn palette.
sns.set(style="white", color_codes=True)
In [48]:
# Load the pickled dataset, discard the columns this analysis does not use, and
# keep only rows with more than 100 code lines, more than 1 class and more than
# 1 function.
# NOTE(review): read_pickle can execute arbitrary code while unpickling -- only
# point this at a file from a trusted source.
raw = pd.read_pickle("../data/dataset.pkl")

# `columns=` replaces the positional axis argument (`drop("watchers", 1)`),
# which newer pandas releases reject with a TypeError.
df = raw.drop(columns=["watchers", "network"])

is_substantial = (
    (df["code_lines"] > 100)
    & (df["code_classes"] > 1)
    & (df["code_functions"] > 1)
)
# .copy() gives an independent frame, so adding columns to `df` later does not
# trigger SettingWithCopyWarning.
df = df.loc[is_substantial].copy()
In [49]:
# Corpus-wide ratio: summed test_lines divided by summed code_lines over all
# rows of the filtered frame (not the mean of per-row ratios).
test_total, code_total = df["test_lines"].sum(), df["code_lines"].sum()
test_total / code_total
Out[49]:
In [50]:
# Corpus-wide ratio: summed test_classes divided by summed code_classes over
# all rows of the filtered frame (not the mean of per-row ratios).
test_total, code_total = df["test_classes"].sum(), df["code_classes"].sum()
test_total / code_total
Out[50]:
In [51]:
# Corpus-wide ratio: summed test_functions divided by summed code_functions
# over all rows of the filtered frame (not the mean of per-row ratios).
test_total, code_total = df["test_functions"].sum(), df["code_functions"].sum()
test_total / code_total
Out[51]:
In [52]:
# Add one per-row ratio column for each size metric: test count / code count.
# Each entry is (new column, numerator column, denominator column).
ratio_definitions = [
    ("line_ratio", "test_lines", "code_lines"),
    ("class_ratio", "test_classes", "code_classes"),
    ("function_ratio", "test_functions", "code_functions"),
]
for ratio_col, test_col, code_col in ratio_definitions:
    df[ratio_col] = df[test_col] / df[code_col]
In [53]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame's
# numeric columns, including the ratio columns added in the previous cell.
df.describe()
Out[53]:
In [54]:
# Column means over the rows whose language is "Python".
# numeric_only=True is required because the frame holds at least one string
# column ("language"); without it newer pandas raises a TypeError instead of
# silently skipping non-numeric columns.
df[df["language"] == "Python"].mean(numeric_only=True)
Out[54]:
In [55]:
# Column means over the rows whose language is "Java".
# numeric_only=True is required because the frame holds at least one string
# column ("language"); without it newer pandas raises a TypeError instead of
# silently skipping non-numeric columns.
df[df["language"] == "Java"].mean(numeric_only=True)
Out[55]:
In [56]:
# Per-language slices of the filtered frame, used by the comparison plots.
python = df.loc[df["language"].eq("Python")]
java = df.loc[df["language"].eq("Java")]
In [57]:
# Grouped bar chart: mean code_lines and mean test_lines, Python vs. Java.
# Building the frame from per-language Series keeps the metric names as the
# row index, so the x-axis ticks read "code_lines"/"test_lines" instead of the
# meaningless 0/1 that a bare list of lists produces.
size_metrics = ["code_lines", "test_lines"]
df2 = pd.DataFrame(
    {
        "Python": python[size_metrics].mean(),
        "Java": java[size_metrics].mean(),
    },
    columns=["Python", "Java"],  # pin the bar order regardless of dict ordering
)
ax = df2.plot.bar(rot=0, title="Mean code vs. test lines by language")
ax.set_ylabel("mean lines");
In [69]:
# Grouped bar chart: mean per-row test/code ratios (lines, classes, functions),
# Python vs. Java.
# Building the frame from per-language Series keeps the ratio names as the row
# index, so the x-axis ticks are labelled instead of showing 0/1/2.
ratio_metrics = ["line_ratio", "class_ratio", "function_ratio"]
df2 = pd.DataFrame(
    {
        "Python": python[ratio_metrics].mean(),
        "Java": java[ratio_metrics].mean(),
    },
    columns=["Python", "Java"],  # pin the bar order regardless of dict ordering
)
ax = df2.plot.bar(rot=0, title="Mean test-to-code ratios by language")
ax.set_ylabel("mean ratio (test / code)");
In [72]:
# Scatter of forks against line_ratio. The axes are clipped to x in [0, 2] and
# y in [0, 2000], so rows outside that window are not drawn.
df.plot.scatter(x="line_ratio", y="forks", xlim=(0, 2), ylim=(0, 2000))
Out[72]:
In [78]:
def _share_without_tests(projects):
    """Return the fraction of rows in `projects` whose test_lines equals 0."""
    untested = projects[projects["test_lines"] == 0]
    return len(untested) / len(projects)


# Fraction of rows with zero test lines, per language, shown as a bar chart.
python_no_test = _share_without_tests(python)
java_no_test = _share_without_tests(java)
x = [[python_no_test, java_no_test]]
no_test = pd.DataFrame(x, columns=["Python", "Java"])
no_test.plot.bar()
Out[78]:
In [ ]: