In [19]:
library(dplyr)

In [20]:
# Load the per-entity table of revision counts, page views and predicted
# quality class from a headerless, tab-separated file.
# Quoting and comment handling are switched off explicitly: with read.table's
# defaults a stray quote character or '#' inside a field would silently merge
# or truncate rows of a raw TSV export.
quality_prediction_and_page_views <- read.table(
  file.path(
    "../results/sql_queries/entity_views_and_aggregated_revisions",
    "entity_views_and_aggregated_revisions_and_quality_scoring_prediction_converted_20130701.tsv"
  ),
  header = FALSE,
  sep = "\t",
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)

In [21]:
# The input file has no header row, so label the five columns by position.
names(quality_prediction_and_page_views) <- c(
  "entity_id",
  "number_of_revisions",
  "page_views",
  "prediction",
  "ordinal_score"
)

In [22]:
# Preview the first six rows to check that the columns were labelled correctly.
quality_prediction_and_page_views %>%
  head(n = 6L)


entity_id number_of_revisions page_views prediction ordinal_score
Q1000999 33 736 E 1
Q1001536411 1 E 1
Q10018576 9 21 E 1
Q1002034824 11 E 1
Q1002083211 12 E 1
Q1002822013 6 E 1

In [23]:
# Spearman rank correlation between page views and the ordinal quality score.
with(
  quality_prediction_and_page_views,
  cor(page_views, ordinal_score, method = "spearman")
)


0.256714011897879

In [24]:
# Ordinary least-squares fit of page views on the ordinal quality score.
# The formula uses bare column names together with `data =` rather than
# `df$column` terms: the fitted coefficients are the same, but the model can
# then be used with predict(..., newdata = ) and its coefficient labels are
# simply `ordinal_score` instead of the full `df$column` expression.
quality_prediction_and_page_views_model <- lm(
  page_views ~ ordinal_score,
  data = quality_prediction_and_page_views
)

In [25]:
# Print coefficient estimates, residual quantiles and fit statistics.
quality_prediction_and_page_views_model %>%
  summary()


Call:
lm(formula = quality_prediction_and_page_views$page_views ~ quality_prediction_and_page_views$ordinal_score)

Residuals:
       Min         1Q     Median         3Q        Max 
-3.937e+05 -2.239e+04 -2.213e+04 -2.081e+04  1.253e+10 

Coefficients:
                                                Estimate Std. Error t value
(Intercept)                                       -89695       5555  -16.15
quality_prediction_and_page_views$ordinal_score   112111       4410   25.42
                                                Pr(>|t|)    
(Intercept)                                       <2e-16 ***
quality_prediction_and_page_views$ordinal_score   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 7304000 on 11668022 degrees of freedom
Multiple R-squared:  5.538e-05,	Adjusted R-squared:  5.529e-05 
F-statistic: 646.2 on 1 and 11668022 DF,  p-value: < 2.2e-16

Class A Items with the Fewest Page Views


In [26]:
# Keep only the rows whose predicted class is "A".
class_a_quality_prediction_and_page_views <-
  quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "A")

In [27]:
# Order the class "A" rows from fewest to most page views.
sorted_ascend_class_a_quality_prediction_and_page_views <-
  class_a_quality_prediction_and_page_views %>%
  dplyr::arrange(page_views)

In [28]:
# Show up to ten class "A" rows with the lowest page-view counts.
sorted_ascend_class_a_quality_prediction_and_page_views %>%
  head(n = 10L)


entity_id number_of_revisions page_views prediction ordinal_score
Q3152321 687 77136 A 5
Q273461 1176 241736 A 5
Q159 1945 653480145 A 5

Class E Items with the Most Page Views


In [ ]:


In [29]:
# Keep only the rows whose predicted class is "E".
class_e_quality_prediction_and_page_views <-
  quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "E")

In [30]:
# Order the class "E" rows from most to fewest page views.
sorted_desc_class_e_quality_prediction_and_page_views <-
  class_e_quality_prediction_and_page_views %>%
  dplyr::arrange(dplyr::desc(page_views))

In [31]:
# Show the ten class "E" rows with the highest page-view counts.
sorted_desc_class_e_quality_prediction_and_page_views %>%
  head(n = 10L)


entity_id number_of_revisions page_views prediction ordinal_score
Q2597810 100 2128920607 E 1
Q623578 135 2097991400 E 1
Q750403 192 2084693498 E 1
Q1967876 102 2084215818 E 1
Q150248 68 2068796814 E 1
Q1868372 45 2056080224 E 1
Q4584301 55 2052339927 E 1
Q105584 75 2049926923 E 1
Q40629 195 2049755644 E 1
Q31165 67 2048330818 E 1

In [32]:
# Number of rows predicted as class "E".
class_e_quality_prediction_and_page_views %>%
  nrow()


10345136

In [33]:
# Number of rows predicted as class "D".
quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "D") %>%
  nrow()


755623

In [34]:
# Number of rows predicted as class "C".
quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "C") %>%
  nrow()


562134

In [35]:
# Number of rows predicted as class "B".
quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "B") %>%
  nrow()


5128

In [36]:
# Number of rows predicted as class "A".
quality_prediction_and_page_views %>%
  dplyr::filter(prediction == "A") %>%
  nrow()


3

In [ ]: