In [35]:
library(dplyr)
In [ ]:
# Load the entity / project / page usage table from a headerless,
# tab-separated file.
# quote = "" and comment.char = "" switch off read.table's default handling of
# quote characters and '#', so such characters inside a field cannot merge or
# truncate rows of the tab-separated input.
entity_project_page <- read.table(
  "../results/sql_queries/entity_project_pages.tsv",
  header = FALSE,
  sep = "\t",
  quote = "",
  comment.char = ""
)
In [37]:
# Give the three columns of the loaded table descriptive names.
names(entity_project_page) <- c("entity_id", "project", "page_id")
In [38]:
# Total number of rows in the usage table.
dim(entity_project_page)[1L]
In [39]:
# Count the rows of the usage table per entity_id (count column `n`) and
# order the entities from the largest count to the smallest.
sorted_page_frequency_by_entity <- entity_project_page %>%
  count(entity_id) %>%
  dplyr::arrange(desc(n))
In [40]:
# Column-wise summary of the per-entity counts.
summary(object = sorted_page_frequency_by_entity)
In [41]:
# Standard deviation of the per-entity counts.
sd(sorted_page_frequency_by_entity[["n"]])
In [42]:
# Show the entities with the highest counts (head's default of six rows).
head(sorted_page_frequency_by_entity, n = 6L)
In [43]:
# Histogram of the per-entity counts on a base-2 logarithmic scale.
hist(
  log2(sorted_page_frequency_by_entity[["n"]]),
  main = "Distribution of Page Usages",
  xlab = "Log of Page Usages"
)
In [44]:
# Show the 25 entities with the highest counts.
head(sorted_page_frequency_by_entity, n = 25L)
In [45]:
# Keep only the entities whose count is exactly one.
# which() drops any NA comparison results, matching subset()'s behaviour.
entities_used_by_one_page <- sorted_page_frequency_by_entity[
  which(sorted_page_frequency_by_entity[["n"]] == 1), , drop = FALSE
]
In [46]:
# Number of entities with a count of exactly one.
dim(entities_used_by_one_page)[1L]
In [47]:
# Share of all entities that have a count of exactly one.
dim(entities_used_by_one_page)[1L] /
  dim(sorted_page_frequency_by_entity)[1L]
In [48]:
# Keep only the entities whose count is at most nine.
# which() drops any NA comparison results, matching subset()'s behaviour.
entities_used_by_nine_or_less_pages <- sorted_page_frequency_by_entity[
  which(sorted_page_frequency_by_entity[["n"]] <= 9), , drop = FALSE
]
In [49]:
# Number of entities with a count of nine or fewer.
dim(entities_used_by_nine_or_less_pages)[1L]
In [50]:
# Share of all entities that have a count of nine or fewer.
dim(entities_used_by_nine_or_less_pages)[1L] /
  dim(sorted_page_frequency_by_entity)[1L]
In [51]:
# Count the rows of the usage table for every (entity_id, project) pair.
page_frequency_by_entity_and_project <- entity_project_page %>%
  count(entity_id, project)
In [52]:
# Drop the count column, leaving one row per (entity_id, project) pair.
# The first two columns of the count result are the grouping columns
# entity_id and project, selected here by name.
entities_and_projects <-
  page_frequency_by_entity_and_project[c("entity_id", "project")]
In [53]:
# Count the (entity_id, project) rows per entity_id (count column `n`) and
# order the entities from the largest count to the smallest.
sorted_project_frequency_by_entity <- entities_and_projects %>%
  count(entity_id) %>%
  dplyr::arrange(desc(n))
In [54]:
# Column-wise summary of the per-entity project counts.
summary(object = sorted_project_frequency_by_entity)
In [55]:
# Standard deviation of the per-entity project counts.
sd(sorted_project_frequency_by_entity[["n"]])
In [56]:
# Histogram of the per-entity project counts on a base-2 logarithmic scale.
hist(
  log2(sorted_project_frequency_by_entity[["n"]]),
  main = "Distribution of Project Usages",
  xlab = "Log of Project Usages"
)
In [75]:
# Show the 100 entities with the highest project counts.
head(sorted_project_frequency_by_entity, n = 100L)
In [58]:
# Keep only the entities whose project count is exactly one.
# which() drops any NA comparison results, matching subset()'s behaviour.
entities_used_by_one_project <- sorted_project_frequency_by_entity[
  which(sorted_project_frequency_by_entity[["n"]] == 1), , drop = FALSE
]
In [59]:
# Number of entities with a project count of exactly one.
dim(entities_used_by_one_project)[1L]
In [60]:
# Share of all entities that have a project count of exactly one.
dim(entities_used_by_one_project)[1L] /
  dim(sorted_project_frequency_by_entity)[1L]
In [61]:
# Pull the per-entity count row for entity Q6581097 (the "male" item,
# going by the variable name).
male_item_pages <- sorted_page_frequency_by_entity %>%
  filter(entity_id == "Q6581097")
In [62]:
# Display the row(s) selected for Q6581097.
head(male_item_pages, n = 6L)
In [63]:
# Pull the per-entity count row for entity Q6581072 (the "female" item,
# going by the variable name).
female_item_pages <- sorted_page_frequency_by_entity %>%
  filter(entity_id == "Q6581072")
In [64]:
# Display the row(s) selected for Q6581072.
head(female_item_pages, n = 6L)
In [65]:
# Ratio of the Q6581072 count to the Q6581097 count.
female_item_pages[["n"]] / male_item_pages[["n"]]
In [66]:
# Rename the count column of the per-entity page counts from `n` to `m`, in
# place, so it no longer shares a name with the `n` column of the per-entity
# project counts.
# NOTE: from here on, sorted_page_frequency_by_entity has no `n` column.
names(sorted_page_frequency_by_entity) <- c("entity_id", "m")
In [67]:
# Join the per-entity page counts (column `m`) with the per-entity project
# counts (column `n`) into one row per entity_id.
# The join key is named explicitly: without `by`, merge() joins on every
# column name the two frames share, so if both count columns were still called
# `n` the join would silently also match on the count values.
project_and_page_frequency_by_entity <- merge(
  sorted_page_frequency_by_entity,
  sorted_project_frequency_by_entity,
  by = "entity_id"
)
In [73]:
# Show the first 100 rows of the merged page/project counts.
head(project_and_page_frequency_by_entity, n = 100L)
In [69]:
# Keep the merged rows whose page count `m` is at least two.
two_or_more_pages_project_and_page_frequency_by_entity <-
  project_and_page_frequency_by_entity %>%
  filter(m >= 2)
In [74]:
# Show the first 100 entities that have a page count of at least two.
head(two_or_more_pages_project_and_page_frequency_by_entity, n = 100L)
In [76]:
# Of the entities with a page count of at least two, keep those whose project
# count `n` is also at least two.
two_or_more_projects_two_or_more_pages_project_and_page_frequency_by_entity <-
  two_or_more_pages_project_and_page_frequency_by_entity %>%
  filter(n >= 2)
In [77]:
# Among entities with a page count of at least two, the share whose project
# count is also at least two.
dim(
  two_or_more_projects_two_or_more_pages_project_and_page_frequency_by_entity
)[1L] /
  dim(two_or_more_pages_project_and_page_frequency_by_entity)[1L]
In [ ]: