Einleitung

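  • Datengrundlage: Git-Blame-Log (linux_blame_log.csv)
  • Ziel: Wissensträger und Wissenslücken im Code sichtbar machen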

In [ ]:
import pandas as pd

# Read the blame log from its relative CSV location into a DataFrame.
blame_log_csv = "../dataset/linux_blame_log.csv"
git_blame = pd.read_csv(blame_log_csv)
# Preview the first rows as the cell output.
git_blame.head()

In [ ]:
# Baseline column dtypes and memory footprint before any dtype conversion;
# memory_usage="deep" asks pandas to measure object columns by their real size.
git_blame.info(memory_usage="deep")

In [ ]:
# Parse the timestamps and store path/author as categoricals.
git_blame["timestamp"] = pd.to_datetime(git_blame["timestamp"])
for column in ("path", "author"):
    git_blame[column] = git_blame[column].astype("category")
# Report dtypes and memory again so the effect of the conversion is visible.
git_blame.info(memory_usage="deep")

In [ ]:
%matplotlib inline
git_blame.author.value_counts().head(10).plot.pie()

Wissensträger


In [ ]:
# Cut-off for "recent" knowledge: one year before the newest timestamp in the
# blame log. Anchoring on the data instead of the wall clock keeps the result
# reproducible for a static CSV snapshot (with "today" the recent set shrinks
# on every later run until it is empty), and the cut-off automatically has the
# same timezone-awareness as the column it is compared against.
# Requires git_blame["timestamp"] to already be a datetime column.
a_year_ago = git_blame["timestamp"].max() - pd.DateOffset(years=1)
a_year_ago

In [ ]:
# Flag every row whose timestamp lies on or after the cut-off.
is_recent = git_blame["timestamp"] >= a_year_ago
git_blame["knowing"] = is_recent
git_blame.head()

In [ ]:
# Keep only the rows flagged as recent knowledge.
knowledge_carrier = git_blame.loc[git_blame["knowing"]]
knowledge_carrier.head()

In [ ]:
# Share of all recent rows held by each author; show the ten largest shares.
recent_rows_per_author = knowledge_carrier["author"].value_counts()
(recent_rows_per_author / len(knowledge_carrier)).head(10)

Wissenslücken im Code


In [ ]:
# Paths with the most rows in the blame log.
git_blame["path"].value_counts().head()

In [ ]:
# Derive a coarse component name from the first two path segments,
# e.g. "a/b/c.c" becomes "a:b".
path_segments = git_blame["path"].str.split("/")
git_blame["component"] = path_segments.str.slice(stop=2).str.join(":")
git_blame.head()

In [ ]:
# Rows per component, largest first; limited to the top ten so the cell
# actually displays a compact result instead of computing and discarding it.
git_blame["component"].value_counts().head(10)

In [ ]:
# Mean of the boolean "knowing" flag per component, i.e. the fraction of
# each component's rows that are flagged as recent knowledge.
knowledge_per_component = (
    git_blame
    .groupby("component")[["knowing"]]
    .mean()
)
knowledge_per_component.head()

In [ ]:
# Horizontal bar chart of the known fraction, sorted ascending;
# the tall figure leaves room for one bar per component.
knowledge_sorted = knowledge_per_component["knowing"].sort_values()
knowledge_sorted.plot.barh(figsize=[5, 20])