In [2]:
library(dplyr)

In [3]:
# Read the header-less, tab-separated export into a data frame.
# quote = "" and comment.char = "" make read.table treat the file as a strict
# TSV: with the defaults, a stray ' or " or # inside a field would silently
# merge or truncate rows. stringsAsFactors = FALSE keeps text columns as
# character regardless of the R version's default.
quality_prediction_and_page_views <- read.table(
  "../results/sql_queries/entity_views_and_aggregated_revisions/entity_views_and_aggregated_revisions_and_quality_scoring_prediction_converted_20170701.tsv",
  header = FALSE,
  sep = "\t",
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)

In [4]:
# Label the five columns of the header-less table.
names(quality_prediction_and_page_views) <- c(
  "entity_id",
  "number_of_revisions",
  "page_views",
  "prediction",
  "ordinal_score"
)

In [5]:
# Preview the first six rows of the table.
head(x = quality_prediction_and_page_views, n = 6L)


entity_id number_of_revisions page_views prediction ordinal_score
Q1000999 33 736 C 3
Q1001536411 1 E 1
Q10018576 9 21 E 1
Q1002034824 11 E 1
Q1002083211 12 E 1
Q1002822013 6 E 1

In [6]:
# Spearman rank correlation between page views and the ordinal quality score.
with(
  quality_prediction_and_page_views,
  cor(page_views, ordinal_score, method = "spearman")
)


0.138832780574458

In [7]:
# Linear regression of page views on the ordinal quality score.
# Columns are supplied through `data =` instead of `df$col` inside the
# formula, so the terms get short names (`ordinal_score`) and
# predict(newdata = ...) works on the fitted model. The fitted estimates are
# unchanged; only the term labels in the summary differ.
quality_prediction_and_page_views_model <- lm(
  page_views ~ ordinal_score,
  data = quality_prediction_and_page_views
)

In [8]:
# Show coefficients, residual quantiles and fit statistics of the model.
summary(object = quality_prediction_and_page_views_model)


Call:
lm(formula = quality_prediction_and_page_views$page_views ~ quality_prediction_and_page_views$ordinal_score)

Residuals:
       Min         1Q     Median         3Q        Max 
-1.313e+05 -3.261e+04  8.080e+02  9.490e+02  1.253e+10 

Coefficients:
                                                Estimate Std. Error t value
(Intercept)                                       -34220       2606  -13.13
quality_prediction_and_page_views$ordinal_score    33414       1318   25.36
                                                Pr(>|t|)    
(Intercept)                                       <2e-16 ***
quality_prediction_and_page_views$ordinal_score   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5582000 on 22151121 degrees of freedom
Multiple R-squared:  2.902e-05,	Adjusted R-squared:  2.898e-05 
F-statistic: 642.9 on 1 and 22151121 DF,  p-value: < 2.2e-16

Class A Items with the Fewest Page Views


In [9]:
# Keep only the rows whose prediction is "A".
class_a_quality_prediction_and_page_views <-
  quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "A")

In [10]:
# Order the class A rows by page views, smallest first.
sorted_ascend_class_a_quality_prediction_and_page_views <-
  class_a_quality_prediction_and_page_views %>%
  dplyr::arrange(page_views)

In [11]:
# The ten class A rows with the fewest page views.
head(
  x = sorted_ascend_class_a_quality_prediction_and_page_views,
  n = 10L
)


entity_id number_of_revisions page_views prediction ordinal_score
Q8210649 82 1549 A 5
Q708515 99 2365 A 5
Q2914608 96 2680 A 5
Q1366473119 2732 A 5
Q158881 102 2783 A 5
Q732221 93 3505 A 5
Q895898 126 3577 A 5
Q159145 93 3644 A 5
Q63782 93 3791 A 5
Q3273285 91 3817 A 5

Class E Items with the Most Page Views


In [ ]:


In [12]:
# Keep only the rows whose prediction is "E".
class_e_quality_prediction_and_page_views <-
  quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "E")

In [13]:
# Order the class E rows by page views, largest first.
sorted_desc_class_e_quality_prediction_and_page_views <-
  class_e_quality_prediction_and_page_views %>%
  dplyr::arrange(dplyr::desc(page_views))

In [14]:
# The ten class E rows with the most page views.
head(
  x = sorted_desc_class_e_quality_prediction_and_page_views,
  n = 10L
)


entity_id number_of_revisions page_views prediction ordinal_score
Q6883832 25 2045602031 E 1
Q4043051 32 2045595548 E 1
Q4442644 9 2045584646 E 1
Q4299813 13 2045575479 E 1
Q4380129 15 2045574094 E 1
Q18241050 6 2045553487 E 1
Q19902884 32 1141733962 E 1
Q22302160 16 1139176637 E 1
Q6007395 12 225642217 E 1
Q23312670 4 106919685 E 1

In [15]:
# Number of rows predicted as class E.
dim(class_e_quality_prediction_and_page_views)[1L]


11743883

In [16]:
# Number of rows predicted as class D.
quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "D") %>%
  nrow()


4472790

In [17]:
# Number of rows predicted as class C.
quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "C") %>%
  nrow()


5429662

In [18]:
# Number of rows predicted as class B.
quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "B") %>%
  nrow()


500819

In [19]:
# Number of rows predicted as class A.
quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "A") %>%
  nrow()


3969

In [ ]: