In [1]:
import pandas as pd

# Import from the rpy2 module
from rpy2 import robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

In [2]:
stats = importr('stats') # Needed to run a linear regression in R
base = importr('base') # Needed to output the summary of the linear regression

In [3]:
pandas2ri.activate() # Automatically convert pandas Data Frames to R Data Frames

In [4]:
df = pd.read_csv('files/all_claims_files.csv')

In [5]:
robjects.globalenv['df'] = df # Add the pandas Data Frame to the R environment

In [6]:
model = stats.lm('MEDREIMB_IP ~ BENE_ESRD_IND', data=base.as_symbol('df')) # Call the linear regression

In [7]:
pd.set_option('max_rows', 100)

In [8]:
print(base.summary(model)) # output linear regression results


Call:
(function (formula, data, subset, weights, na.action, method = "qr", 
    model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, 
    contrasts = NULL, offset, ...) 
{
    ret.x <- x
    ret.y <- y
    cl <- match.call()
    mf <- match.call(expand.dots = FALSE)
    m <- match(c("formula", "data", "subset", "weights", "na.action", 
        "offset"), names(mf), 0L)
    mf <- mf[c(1L, m)]
    mf$drop.unused.levels <- TRUE
    mf[[1L]] <- quote(stats::model.frame)
    mf <- eval(mf, parent.frame())
    if (method == "model.frame") 
        return(mf)
    else if (method != "qr") 
        warning(gettextf("method = '%s' is not supported. Using 'qr'", 
            method), domain = NA)
    mt <- attr(mf, "terms")
    y <- model.response(mf, "numeric")
    w <- as.vector(model.weights(mf))
    if (!is.null(w) && !is.numeric(w)) 
        stop("'weights' must be a numeric vector")
    offset <- as.vector(model.offset(mf))
    if (!is.null(offset)) {
        if (length(offset) != NROW(y)) 
            stop(gettextf("number of offsets is %d, should equal %d (number of observations)", 
                length(offset), NROW(y)), domain = NA)
    }
    if (is.empty.model(mt)) {
        x <- NULL
        z <- list(coefficients = if (is.matrix(y)) matrix(, 0, 
            3) else numeric(), residuals = y, fitted.values = 0 * 
            y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w != 
            0) else if (is.matrix(y)) nrow(y) else length(y))
        if (!is.null(offset)) {
            z$fitted.values <- offset
            z$residuals <- y - offset
        }
    }
    else {
        x <- model.matrix(mt, mf, contrasts)
        z <- if (is.null(w)) 
            lm.fit(x, y, offset = offset, singular.ok = singular.ok, 
                ...)
        else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, 
            ...)
    }
    class(z) <- c(if (is.matrix(y)) "mlm", "lm")
    z$na.action <- attr(mf, "na.action")
    z$offset <- offset
    z$contrasts <- attr(x, "contrasts")
    z$xlevels <- .getXlevels(mt, mf)
    z$call <- cl
    z$terms <- mt
    if (model) 
        z$model <- mf
    if (ret.x) 
        z$x <- x
    if (ret.y) 
        z$y <- y
    if (!qr) 
        z$qr <- NULL
    z
})(formula = "MEDREIMB_IP ~ BENE_ESRD_IND", data = df)

Residuals:
   Min     1Q Median     3Q    Max 
 -8576  -1460  -1460  -1460 114094 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)      1459.6      101.7   14.35   <2e-16 ***
BENE_ESRD_INDY   7116.6      356.9   19.94   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 7551 on 5998 degrees of freedom
Multiple R-squared:  0.06215,	Adjusted R-squared:  0.062 
F-statistic: 397.5 on 1 and 5998 DF,  p-value: < 2.2e-16



In [ ]: