Statistics for Entity Page Usage


In [35]:
library(dplyr)

In [ ]:
# Load the header-less, tab-separated entity/project/page usage dump.
# Quote and comment handling are switched off: with read.table()'s defaults
# (quote = "\"'", comment.char = "#") a stray quote character would swallow
# field and line breaks, and a '#' would truncate the rest of a line, silently
# corrupting rows of a raw TSV export.
entity_project_page <- read.table(
  "../results/sql_queries/entity_project_pages.tsv",
  header = FALSE,
  sep = "\t",
  quote = "",
  comment.char = ""
)

In [37]:
# Label the three unnamed columns of the usage table.
names(entity_project_page) <- c("entity_id", "project", "page_id")

In [38]:
# Total number of rows in the entity/project/page usage table.
nrow(entity_project_page)


160803457

Page Frequency by Entity


In [39]:
# Count the usage rows for each entity_id (column n) and order the result so
# the entities with the highest counts come first.
sorted_page_frequency_by_entity <- entity_project_page %>%
  count(entity_id) %>%
  arrange(desc(n))

In [40]:
# Per-column summary of the per-entity page-count table.
summary(sorted_page_frequency_by_entity)


   entity_id              n            
 P1     :       1   Min.   :      1.0  
 P10    :       1   1st Qu.:      1.0  
 P100   :       1   Median :      1.0  
 P1000  :       1   Mean   :      7.2  
 P10000 :       1   3rd Qu.:      3.0  
 P1001  :       1   Max.   :2451812.0  
 (Other):22250015                      

In [41]:
# Standard deviation of the per-entity page counts.
sd(sorted_page_frequency_by_entity[["n"]])


1481.25824008857

In [42]:
# Show the six entities with the highest page counts.
head(sorted_page_frequency_by_entity, n = 6L)


entity_id n
Q54919 2451812
Q423048 1968766
Q2597810 1937505
Q131454 1924192
Q13219454 1924125
Q36578 1917354

In [43]:
# Histogram of the per-entity page counts on a base-2 logarithmic scale.
hist(
  log2(sorted_page_frequency_by_entity[["n"]]),
  main = "Distribution of Page Usages",
  xlab = "Log of Page Usages"
)



In [44]:
# Show the 25 entities with the highest page counts.
head(sorted_page_frequency_by_entity, n = 25)


entity_id n
Q54919 2451812
Q423048 1968766
Q2597810 1937505
Q131454 1924192
Q13219454 1924125
Q36578 1917354
Q193563 1595632
Q19938912 1467497
P373 1331718
P856 1169632
Q2494649 898817
Q623578 836262
Q148 824605
P18 764223
P281 723276
Q6581097 722871
Q17299517 721518
P17 671831
P625 634310
Q55 634274
Q5 611464
Q758610 593937
Q1967876 574405
P571 550523
Q750403 550193

Entities used by one page


In [45]:
# Keep only the entities whose page count is exactly one.
entities_used_by_one_page <- sorted_page_frequency_by_entity[
  sorted_page_frequency_by_entity$n == 1,
]

In [46]:
# Number of entities with a page count of exactly one.
nrow(entities_used_by_one_page)


12107912

In [47]:
# Share of all counted entities that have a page count of exactly one.
nrow(entities_used_by_one_page)/nrow(sorted_page_frequency_by_entity)


0.544175306621059

Entities used by nine or fewer pages


In [48]:
# Keep only the entities whose page count is at most nine.
entities_used_by_nine_or_less_pages <- sorted_page_frequency_by_entity[
  sorted_page_frequency_by_entity$n <= 9,
]

In [49]:
# Number of entities with a page count of nine or fewer.
nrow(entities_used_by_nine_or_less_pages)


21215568

In [50]:
# Share of all counted entities that have a page count of nine or fewer.
nrow(entities_used_by_nine_or_less_pages)/nrow(sorted_page_frequency_by_entity)


0.953507774217382

Project Frequency by Entity


In [51]:
# Count the usage rows for every (entity_id, project) combination.
page_frequency_by_entity_and_project <- entity_project_page %>%
  count(entity_id, project)

In [52]:
# Keep the entity_id and project columns (the first two columns of the count
# table); the per-pair count column is left out.
entities_and_projects <-
  page_frequency_by_entity_and_project[c("entity_id", "project")]

In [53]:
# Count the (entity_id, project) rows for each entity_id (column n) and order
# the result so the entities with the highest counts come first.
sorted_project_frequency_by_entity <- entities_and_projects %>%
  count(entity_id) %>%
  arrange(desc(n))

In [54]:
# Per-column summary of the per-entity project-count table.
summary(sorted_project_frequency_by_entity)


   entity_id              n         
 P1     :       1   Min.   :  1.00  
 P10    :       1   1st Qu.:  1.00  
 P100   :       1   Median :  1.00  
 P1000  :       1   Mean   :  2.66  
 P10000 :       1   3rd Qu.:  2.00  
 P1001  :       1   Max.   :611.00  
 (Other):22250015                   

In [55]:
# Standard deviation of the per-entity project counts.
sd(sorted_project_frequency_by_entity[["n"]])


5.43701787213839

In [56]:
# Histogram of the per-entity project counts on a base-2 logarithmic scale.
hist(
  log2(sorted_project_frequency_by_entity[["n"]]),
  main = "Distribution of Project Usages",
  xlab = "Log of Project Usages"
)



In [75]:
# Show the 100 entities with the highest project counts.
head(sorted_project_frequency_by_entity, n = 100)


entity_id n
Q4847311 611
Q20739451 594
Q5296 591
Q4299475 492
Q4844001 483
Q4039395 435
Q4608595 429
Q5461620 420
Q1281 412
Q3740 412
Q16503 402
Q5626526 396
Q5400303 378
Q5964 378
Q21042795 362
Q30 358
Q5544939 358
Q5626683 358
Q6395874 356
Q159 355
Q5547114 355
Q183 354
Q142 350
Q5543187 349
Q6398355 348
Q6398678 347
Q4655215 346
Q1457903 345
Q21042800 344
Q5462387 344
Q212 311
Q5541627 310
Q5544913 310
Q5626440 310
Q90 310
Q1845 309
Q21042798 309
Q39 309
Q1321 307
Q79 307
Q1410903 306
Q914807 306
Q5462890 304
Q219 303
Q801 303
Q1457402 301
Q188 301
Q5546014 301
Q5567997 301
Q227 300
Q4587662 300
Q6334099 300
Q17 299
Q189 299
Q649 298
Q5614597 297
Q28 296
Q64 296
Q48029 295
Q220 294

Entities used by one project


In [58]:
# Keep only the entities whose project count is exactly one.
entities_used_by_one_project <- sorted_project_frequency_by_entity[
  sorted_project_frequency_by_entity$n == 1,
]

In [59]:
# Number of entities with a project count of exactly one.
nrow(entities_used_by_one_project)


12953973

In [60]:
# Share of all counted entities that have a project count of exactly one.
nrow(entities_used_by_one_project)/nrow(sorted_project_frequency_by_entity)


0.582200484215273

Male versus Female Bias

"Male" Item Usage


In [61]:
# Page-count row for entity Q6581097 (the item this section calls "Male").
male_item_pages <- sorted_page_frequency_by_entity %>%
  filter(entity_id == "Q6581097")

In [62]:
# Display the page-count row(s) selected for Q6581097.
head(male_item_pages)


entity_id n
Q6581097 722871

"Female" Item Usage


In [63]:
# Page-count row for entity Q6581072 (the item this section calls "Female").
female_item_pages <- sorted_page_frequency_by_entity %>%
  filter(entity_id == "Q6581072")

In [64]:
# Display the page-count row(s) selected for Q6581072.
head(female_item_pages)


entity_id n
Q6581072 151783

In [65]:
# Ratio of the Q6581072 page count to the Q6581097 page count.
female_item_pages[["n"]] / male_item_pages[["n"]]


0.209972457049736

Entities used by more than one page


In [66]:
# Relabel the page-count column from n to m so it no longer shares a name with
# the count column n of the per-entity project-count table.
names(sorted_page_frequency_by_entity) <- c("entity_id", "m")

In [67]:
# Join the per-entity page counts (column m) with the per-entity project counts
# (column n), one row per entity_id.
# The key is named explicitly: merge() otherwise joins on every column name the
# two tables share, so if the page-count column were still called n the join
# would silently keep only entities whose two counts happen to be equal.
project_and_page_frequency_by_entity <- merge(
  sorted_page_frequency_by_entity,
  sorted_project_frequency_by_entity,
  by = "entity_id"
)

In [73]:
# Show the first 100 rows of the merged page-count (m) / project-count (n) table.
head(project_and_page_frequency_by_entity, n= 100)


entity_id m n
P1 11 3
P10 424 33
P100 9 2
P1000 287 4
P10000 2 1
P1001 1006 10
P1002 14 3
P1003 56 10
P1004 27 2
P1005 220 9
P1006 238 13
P1007 11 2
P1008 2 2
P1009 2 2
P101 43706 37
P1010 52 2
P1011 67 3
P1012 25 3
P1013 107 2
P1014 143 3
P1015 104 19
P1016 205 2
P1017 223 10
P1018 3783 11
P1019 11 3
P102 61267 26
P1020 2 2
P1021 170 2
P1022 172 2
P1023 167 2
P1060 270 4
P1061 3 2
P1062 3 2
P1063 3 2
P1064 831 6
P1065 3370912
P1066 149913
P1067 166 2
P1068 15 3
P1069 14 2
P107 350 4
P1070 7342923
P1071 262 8
P1072 3991 6
P1073 3988 5
P1074 37 2
P1075 2970 7
P1076 275 5
P1077 31857 7
P1078 10 2
P1079 28 3
P108 4597221
P1080 177 6
P1081 261 7
P1082 50697645
P1083 2972 9
P1084 71 2
P1085 235 4
P1086 340 6
P1087 3486 6

In [69]:
# Keep the merged rows whose page count m is at least two.
two_or_more_pages_project_and_page_frequency_by_entity <-
  project_and_page_frequency_by_entity %>%
  filter(m >= 2)

In [74]:
# Show the first 100 rows of the table restricted to page count m >= 2.
head(two_or_more_pages_project_and_page_frequency_by_entity, n=100)


entity_id m n
P1 11 3
P10 424 33
P100 9 2
P1000 287 4
P10000 2 1
P1001 1006 10
P1002 14 3
P1003 56 10
P1004 27 2
P1005 220 9
P1006 238 13
P1007 11 2
P1008 2 2
P1009 2 2
P101 43706 37
P1010 52 2
P1011 67 3
P1012 25 3
P1013 107 2
P1014 143 3
P1015 104 19
P1016 205 2
P1017 223 10
P1018 3783 11
P1019 11 3
P102 61267 26
P1020 2 2
P1021 170 2
P1022 172 2
P1023 167 2
P1060 270 4
P1061 3 2
P1062 3 2
P1063 3 2
P1064 831 6
P1065 3370912
P1066 149913
P1067 166 2
P1068 15 3
P1069 14 2
P107 350 4
P1070 7342923
P1071 262 8
P1072 3991 6
P1073 3988 5
P1074 37 2
P1075 2970 7
P1076 275 5
P1077 31857 7
P1078 10 2
P1079 28 3
P108 4597221
P1080 177 6
P1081 261 7
P1082 50697645
P1083 2972 9
P1084 71 2
P1085 235 4
P1086 340 6
P1087 3486 6

In [76]:
# From the rows with page count m >= 2, keep those whose project count n is
# also at least two.
two_or_more_projects_two_or_more_pages_project_and_page_frequency_by_entity <-
  two_or_more_pages_project_and_page_frequency_by_entity %>%
  filter(n >= 2)

In [77]:
# Among entities with page count m >= 2, the share whose project count n is
# also >= 2.
nrow(two_or_more_projects_two_or_more_pages_project_and_page_frequency_by_entity)/nrow(two_or_more_pages_project_and_page_frequency_by_entity)


0.916579382059491

In [ ]: