In [1]:
library(dplyr)
In [ ]:
# Read the headerless, tab-separated export of per-page view rows.
entity_project_page <- read.table(
  file = "../results/sql_queries/entity_project_page_views.tsv",
  sep = "\t",
  header = FALSE
)
In [ ]:
# Read the headerless, tab-separated export of per-entity view rows.
entity_views <- read.table(
  file = "../results/sql_queries/entity_views.tsv",
  sep = "\t",
  header = FALSE
)
In [4]:
# Keep the raw import under its own name and work on a second binding.
entity_project_page_views <- entity_project_page
In [5]:
# The file carries no header row, so the four columns are named by position.
names(entity_project_page_views) <- c("entity_id", "project", "page_id", "page_views")
In [6]:
summary(entity_project_page_views)
In [7]:
# Tally how many rows of the page-level table belong to each entity_id.
page_frequency_by_entity <- entity_project_page_views %>% count(entity_id)
In [8]:
# count() names its tally column "n"; rename it to page_usages.
names(page_frequency_by_entity) <- c("entity_id", "page_usages")
In [9]:
summary(page_frequency_by_entity)
In [10]:
# Name the two positional columns of the per-entity view table.
names(entity_views) <- c("entity_id", "page_views")
In [11]:
# Join per-entity views with per-entity usage counts; merge() keeps only
# entity_ids present in both tables.
entity_views_usage_frequency <- merge(
  x = entity_views,
  y = page_frequency_by_entity,
  by = "entity_id"
)
In [12]:
summary(entity_views_usage_frequency)
In [13]:
names(entity_views_usage_frequency) <- c("entity_id", "page_views", "page_usages")
In [14]:
head(entity_views_usage_frequency)
In [15]:
# Spearman rank correlation between per-entity page views and page usages.
with(
  entity_views_usage_frequency,
  cor(page_views, page_usages, method = "spearman")
)
In [16]:
# Linear regression of page_views on page_usages.
# The formula uses bare column names with `data =` instead of `df$col` terms:
# the fit is the same, but the coefficients are labelled by column name and
# the model can be used with predict(newdata = ...).
page_view_and_usage_model <- lm(
  page_views ~ page_usages,
  data = entity_views_usage_frequency
)
In [17]:
summary(page_view_and_usage_model)
In [18]:
# Mean of page_views over all page-level rows sharing an entity_id.
entity_view_means <- with(
  entity_project_page_views,
  aggregate(page_views, by = list(entity_id), FUN = mean)
)
In [19]:
summary(entity_view_means)
In [20]:
# Replace aggregate()'s default column names with descriptive ones.
names(entity_view_means) <- c("entity_id", "page_view_mean")
In [21]:
# Combine usage counts and mean views per entity into one table.
entity_view_means_and_usages <- merge(
  x = page_frequency_by_entity,
  y = entity_view_means,
  by = "entity_id"
)
In [22]:
# The two inputs are no longer referenced below; drop them from the workspace.
rm(list = "page_frequency_by_entity")
In [23]:
rm(list = "entity_view_means")
In [24]:
head(entity_view_means_and_usages)
In [25]:
# Spearman rank correlation between usage count and mean page views per entity.
with(
  entity_view_means_and_usages,
  cor(page_usages, page_view_mean, method = "spearman")
)
In [26]:
# Linear regression of page_usages on page_view_mean.
# The formula uses bare column names with `data =` instead of `df$col` terms:
# the fit is the same, but the coefficients are labelled by column name and
# the model can be used with predict(newdata = ...).
entity_view_means_and_usages_model <- lm(
  page_usages ~ page_view_mean,
  data = entity_view_means_and_usages
)
In [27]:
summary(entity_view_means_and_usages_model)
In [28]:
# Entities ordered from highest to lowest mean page views.
sorted_by_highest_view_means <- entity_view_means_and_usages %>%
  dplyr::arrange(dplyr::desc(page_view_mean))
In [29]:
head(sorted_by_highest_view_means, n = 10)
In [30]:
# Entities whose mean page views is exactly zero.
view_means_of_zero <- entity_view_means_and_usages %>%
  dplyr::filter(page_view_mean == 0)
In [31]:
# Among those, put the most frequently used entities first.
view_means_of_zero_sorted_by_page_usages <- view_means_of_zero %>%
  dplyr::arrange(dplyr::desc(page_usages))
In [32]:
head(view_means_of_zero_sorted_by_page_usages, n = 10)
In [33]:
# Entities that appear on exactly one page-level row.
used_once <- filter(entity_view_means_and_usages, page_usages == 1)
In [34]:
# Entities that appear on two or more page-level rows.
# The threshold is `> 1`: with `> 0` every entity would match (counts start at
# 1), so the comparison below would be "used once" vs. "all entities".
used_more_than_once <- filter(entity_view_means_and_usages, page_usages > 1)
In [35]:
# Average of the per-entity mean page views within each group.
mean(used_once$page_view_mean)
In [36]:
mean(used_more_than_once$page_view_mean)
In [37]:
# Keep only rows whose entity_id begins with "P".
property_project_page_views <- entity_project_page_views %>%
  dplyr::filter(startsWith(as.character(entity_id), "P"))
In [38]:
# Of those, keep the rows recorded for the "wikidatawiki" project.
property_project_page_views_for_client_wikidata <- property_project_page_views %>%
  dplyr::filter(project == "wikidatawiki")
In [39]:
head(property_project_page_views_for_client_wikidata)
In [40]:
nrow(property_project_page_views)
In [41]:
nrow(property_project_page_views_for_client_wikidata)
In [42]:
# Fraction of "P" rows that come from wikidatawiki.
nrow(property_project_page_views_for_client_wikidata) / nrow(property_project_page_views)
In [43]:
# Row count per property across all projects.
page_frequency_by_property <- property_project_page_views %>% count(entity_id)
In [44]:
# Row count per property restricted to wikidatawiki.
wikidata_page_frequency_by_property <- property_project_page_views_for_client_wikidata %>%
  count(entity_id)
In [45]:
names(page_frequency_by_property) <- c("property_id", "page_usages")
In [46]:
names(wikidata_page_frequency_by_property) <- c("property_id", "wikidata_page_usages")
In [47]:
# property_id is the only column the two tables share, so it is the join key.
page_frequency_by_property_merged <- merge(
  x = wikidata_page_frequency_by_property,
  y = page_frequency_by_property,
  by = "property_id"
)
In [48]:
# Share of each property's usages that are on wikidatawiki.
page_frequency_by_property_merged$wikidata_average <- with(
  page_frequency_by_property_merged,
  wikidata_page_usages / page_usages
)
In [49]:
head(page_frequency_by_property_merged)
In [50]:
summary(page_frequency_by_property_merged$wikidata_average)
In [51]:
hist(page_frequency_by_property_merged$wikidata_average)
In [52]:
# Ascending order: properties with the smallest wikidatawiki share first.
page_frequency_by_property_merged_sorted_by_wikidata_average <- page_frequency_by_property_merged %>%
  dplyr::arrange(wikidata_average)
In [53]:
head(page_frequency_by_property_merged_sorted_by_wikidata_average, n = 25)
In [54]:
head(property_project_page_views_for_client_wikidata)
In [55]:
# Total page views per property on wikidatawiki only.
property_project_page_views_for_client_wikidata_summed <- with(
  property_project_page_views_for_client_wikidata,
  aggregate(page_views, by = list(entity_id), FUN = sum)
)
In [70]:
# Total page views per property across every project.
property_project_page_views_summed <- with(
  property_project_page_views,
  aggregate(page_views, by = list(entity_id), FUN = sum)
)
In [73]:
names(property_project_page_views_summed) <- c("property_id", "page_views")
In [74]:
head(property_project_page_views_summed)
In [75]:
names(property_project_page_views_for_client_wikidata_summed) <- c("property_id", "wikidata_page_views")
In [76]:
head(property_project_page_views_for_client_wikidata_summed)
In [94]:
# Put wikidatawiki and all-project view totals side by side per property.
# NOTE(review): merge() keeps only property_ids present in both inputs, so
# properties without any wikidatawiki row are dropped here - confirm intended.
property_summed_views <- merge(
  x = property_project_page_views_for_client_wikidata_summed,
  y = property_project_page_views_summed,
  by = "property_id"
)
In [100]:
# Views that did not come from wikidatawiki.
property_summed_views$non_wikidata_page_views <- with(
  property_summed_views,
  page_views - wikidata_page_views
)
In [101]:
# Attach the usage counts; property_id is the only shared column.
page_frequency_by_property_merged_and_summed_views <- merge(
  x = property_summed_views,
  y = page_frequency_by_property_merged,
  by = "property_id"
)
In [102]:
nrow(page_frequency_by_property_merged_and_summed_views)
In [105]:
# Rank properties by their non-wikidatawiki views, highest first.
page_frequency_by_property_merged_and_summed_views_sorted_by_page_views <- page_frequency_by_property_merged_and_summed_views %>%
  dplyr::arrange(dplyr::desc(non_wikidata_page_views))
In [107]:
# Usages that are not on wikidatawiki.
page_frequency_by_property_merged_and_summed_views_sorted_by_page_views$non_wikidata_page_usages <- with(
  page_frequency_by_property_merged_and_summed_views_sorted_by_page_views,
  page_usages - wikidata_page_usages
)
In [108]:
head(page_frequency_by_property_merged_and_summed_views_sorted_by_page_views, n = 25)
In [ ]: