# loading all libraries
library(tidyverse)
library(forcats)
library(rsample)
library(dplyr)
library(ggplot2)
library(plotROC)
library(tidyverse)
library(rsample)
library(glmnet)
library(glmnetUtils)
library(forcats)
library(coefplot)
library(here)

Question 1: Logistic Regression

1a) Generate new features and split the data into testing and training sets.


movies <- read.csv("/Users/nickkondo/OneDrive - Chapman University/BUS 696/datasets/IMDB_movies.csv")

movies_clean <- 
  movies %>% 
  mutate(budgetM = budget/1000000,
         grossM = gross/1000000,
         profitM = grossM - budgetM,
         ROI = profitM/budgetM,
         blockbuster = ifelse(profitM > 100, 1, 0) %>% factor(., levels = c("0","1")),
         genre_main = as.factor(unlist(map(strsplit(as.character(movies$genre),"\\|"),1))) %>% fct_lump(12),
         rating_simple = fct_lump(content_rating, n = 4)) %>% 
  filter(budget < 400000000, 
         content_rating != "",
         content_rating != "Not Raated") %>% 
  mutate(rating_simple = rating_simple %>%  fct_drop()) %>% 
  distinct()

movies_split <- initial_split(movies_clean)
movies_train <- training(movies_split)
movies_test <- testing(movies_split)

1b) The variable “blockbuster” in the movies dataframe equals 1 if a movie earns more than $100M USD in profit, and 0 otherwise. Fit a logistic regression model to predict whether a movie is a blockbuster using imdb_score, budgetM, title_year, and genre_main as predictors. Store this logistic model as movies_logit1 and run the summary command over the fitted logistic model object.


movies_logit1 <- glm(blockbuster ~ imdb_score + budgetM + title_year + genre_main, 
                      family = binomial,
                      data = movies_clean)

summary(movies_logit1)
## 
## Call:
## glm(formula = blockbuster ~ imdb_score + budgetM + title_year + 
##     genre_main, family = binomial, data = movies_clean)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6273  -0.3365  -0.2196  -0.1310   3.7848  
## 
## Coefficients:
##                          Estimate  Std. Error z value
## (Intercept)             27.462266   12.691416   2.164
## imdb_score               1.239380    0.103212  12.008
## budgetM                  0.011590    0.001487   7.795
## title_year              -0.019602    0.006269  -3.127
## genre_mainAdventure      0.271368    0.218499   1.242
## genre_mainAnimation      0.382911    0.485534   0.789
## genre_mainBiography     -0.808113    0.387794  -2.084
## genre_mainComedy         0.264825    0.232279   1.140
## genre_mainCrime         -1.867656    0.541770  -3.447
## genre_mainDocumentary   -1.275772    1.058388  -1.205
## genre_mainDrama         -0.671320    0.265240  -2.531
## genre_mainFantasy      -14.288517  606.075728  -0.024
## genre_mainHorror         0.081534    0.514399   0.159
## genre_mainMystery      -14.852579  744.146251  -0.020
## genre_mainSci-Fi       -15.011164 1343.251662  -0.011
## genre_mainOther          0.996882    0.945102   1.055
##                                   Pr(>|z|)    
## (Intercept)                       0.030476 *  
## imdb_score            < 0.0000000000000002 ***
## budgetM                0.00000000000000644 ***
## title_year                        0.001767 ** 
## genre_mainAdventure               0.214249    
## genre_mainAnimation               0.430323    
## genre_mainBiography               0.037172 *  
## genre_mainComedy                  0.254238    
## genre_mainCrime                   0.000566 ***
## genre_mainDocumentary             0.228052    
## genre_mainDrama                   0.011374 *  
## genre_mainFantasy                 0.981191    
## genre_mainHorror                  0.874061    
## genre_mainMystery                 0.984076    
## genre_mainSci-Fi                  0.991084    
## genre_mainOther                   0.291523    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1702.0  on 3792  degrees of freedom
## Residual deviance: 1341.1  on 3777  degrees of freedom
## AIC: 1373.1
## 
## Number of Fisher Scoring iterations: 16

1c) Exponentiate the fitted coefficient vector and print the results.


exp(movies_logit1$coefficients)
##                (Intercept)                 imdb_score 
## 844715685676.3284912109375            3.4534715375103 
##                    budgetM                 title_year 
##            1.0116574601468            0.9805891246686 
##        genre_mainAdventure        genre_mainAnimation 
##            1.3117573463194            1.4665467912410 
##        genre_mainBiography           genre_mainComedy 
##            0.4456984199728            1.3032024874441 
##            genre_mainCrime      genre_mainDocumentary 
##            0.1544852935298            0.2792154021719 
##            genre_mainDrama          genre_mainFantasy 
##            0.5110333914144            0.0000006231261 
##           genre_mainHorror          genre_mainMystery 
##            1.0849496221323            0.0000003544924 
##           genre_mainSci-Fi            genre_mainOther 
##            0.0000003025063            2.7098183374907

1d) Interpret the coefficient on genre_mainAdventure.

The ratio of odds of a blockbuster movie across the Adventure genre is 1.31, or the Adventure genre has a 31% higher probability of being a blockbuster movie

1e) Interpret the coefficient on imdb_score.

The coefficient of 3.45 means that for every increase of 1 unit in imdb_score, the odds of a movie being a blockbuster increases by 245%.

1f) Suppose that we estimate a raw, un-exponentiated coefficient for a logistic regression of 1.05. Increasing this variable by one unit increases what by 1.05?

In a logistic regression we make predictions to lie in between 0 and 1, in terms of probabilities. The logistic models the dependent variable as the log-odds ratio, and this increases by 1.05 for every increase in one unit.

1g) What is the null hypothesis for exponentiated coefficients in a logistic regression?

The null hypothesis for exponentiation coefficients in a logistic regression are there is no relationship between the predictor variable and the response variable. In other words, the model does not predict the response variable any better than with no predictor variables at all.

1h) Using the movies_logit1 model generate predicted probabilities for the test and training sets.


preds_train <- predict(movies_logit1, newdata = movies_train)

preds_test <- predict(movies_logit1, newdata = movies_test)

head(preds_train)
##      1528       587      2867      1795        71       684 
## -1.813262 -3.948243 -3.175616 -2.533330 -3.772698 -2.985199
head(preds_test)
##          1          3          5          8         11         19 
##  0.6203098 -0.7678983 -0.7402354  0.1576178 -1.8762899 -0.9408938

1i) Generate results dataframes.


results_train <- data.frame( `truth`= movies_train %>% select(blockbuster) %>% 
    mutate(blockbuster = as.numeric(blockbuster)),
  `Class1` =  preds_train,
  `type` = rep("train",length(preds_train))
)

results_test <- data.frame(
  `truth` = movies_test   %>% select(blockbuster) %>% 
    mutate(blockbuster = as.numeric(blockbuster)),
  `Class1` =  preds_test,
  `type` = rep("test",length(preds_test))
)

results <- bind_rows(results_train, results_test)

1j) Use the slice() function to print the first ten rows of the results dataframe.

The blockbuster variable returns a 1 if the movie earns more than $100M in profit, otherwise a 0 is returned. Class1 is the probability of a “blockbuster” movie. Type returns if the varaible is in the train set or the testing set.


slice(results_train, 1:10)
slice(results_test, 1:10)

1k) Generate two ROC plots, one each for the test and training sets. Be sure to label the cutoff probabilities along the ROC lines using the cutoffs.at = c(0.99,0.9,0.7,0.5,0.3,0.1,0).

p_train <- ggplot(results_train,
            aes(m=Class1, d = blockbuster)) +
    geom_roc(labelsize = 3.5, 
             cutoffs.at = c(0.99,0.9,0.7,0.5,0.3,0.1,0)) +
  theme_minimal(base_size = 16)

p_test <- ggplot(results_test,
            aes(m=Class1, d = blockbuster)) +
    geom_roc(labelsize = 3.5, 
             cutoffs.at = c(0.99,0.9,0.7,0.5,0.3,0.1,0)) +
  theme_minimal(base_size = 16)

print(p_train)

print(p_test)

1l) Calculate the AUC for the test and training sets using the function calc_auc.


calc_auc(p_train)
calc_auc(p_test)

AUC means area under the ROC curve. A higher AUC is better because it means a better ROC plot which measures the consequence on true positive fraction and false positive fraction at different cutoffs.

Question 2: Regularized Regression

2a) Create a ridge model against movie_train using variables grossM, duration, rating_simple, aspect_ratio, color, movie_facebook_likes, director_facebook_likes, cast_total_facebook_likes, num_user_for_reviews, num_critic_for_reviews, and genre_main.


ridge_mod <- cv.glmnet(grossM ~ duration + rating_simple + aspect_ratio + color + movie_facebook_likes + director_facebook_likes + cast_total_facebook_likes + num_user_for_reviews + num_critic_for_reviews + genre_main, data = movies_train)

2b) Use the coef function over the ridge_fit object to print the coefficient matrix.


print(ridge_mod$lambda.min)
## [1] 0.619576
print(ridge_mod$lambda.1se)
## [1] 4.797152

coef(ridge_mod, s = ridge_mod$lambda.min) %>% 
  round(3)
## 28 x 1 sparse Matrix of class "dgCMatrix"
##                                s1
## (Intercept)                -7.068
## duration                    0.258
## rating_simpleG             27.887
## rating_simplePG            22.197
## rating_simplePG-13          .    
## rating_simpleR            -26.110
## rating_simpleOther        -20.064
## aspect_ratio               -3.159
## colorBlack and White      -15.703
## colorColor                  0.000
## movie_facebook_likes        0.000
## director_facebook_likes     0.000
## cast_total_facebook_likes   0.000
## num_user_for_reviews        0.064
## num_critic_for_reviews      0.101
## genre_mainAction           13.865
## genre_mainAdventure        15.869
## genre_mainAnimation        24.957
## genre_mainBiography        -4.860
## genre_mainComedy            5.195
## genre_mainCrime            -4.414
## genre_mainDocumentary      -4.741
## genre_mainDrama           -12.550
## genre_mainFantasy           .    
## genre_mainHorror           -2.973
## genre_mainMystery          -2.411
## genre_mainSci-Fi          -20.537
## genre_mainOther            24.541

coef(ridge_mod, s = ridge_mod$lambda.1se) %>% 
  round(3)
## 28 x 1 sparse Matrix of class "dgCMatrix"
##                                s1
## (Intercept)                16.631
## duration                    0.047
## rating_simpleG              3.258
## rating_simplePG            14.061
## rating_simplePG-13          .    
## rating_simpleR            -23.144
## rating_simpleOther         -1.725
## aspect_ratio                .    
## colorBlack and White        .    
## colorColor                  .    
## movie_facebook_likes        0.000
## director_facebook_likes     .    
## cast_total_facebook_likes   0.000
## num_user_for_reviews        0.062
## num_critic_for_reviews      0.089
## genre_mainAction            5.089
## genre_mainAdventure         8.231
## genre_mainAnimation         .    
## genre_mainBiography         .    
## genre_mainComedy            .    
## genre_mainCrime             .    
## genre_mainDocumentary       .    
## genre_mainDrama            -6.735
## genre_mainFantasy           .    
## genre_mainHorror            .    
## genre_mainMystery           .    
## genre_mainSci-Fi            .    
## genre_mainOther             .

2c) Use the plot function over the ridge fit object. Describe the plot in your own words, including the two vertical dashed lines.


plot(ridge_mod)

2d) Estimate a lasso model using the function cv.lasso, using the same independent and dependent variables as before, and store the model as lasso_fit.


lasso_fit <- cv.glmnet(grossM ~ duration + rating_simple + aspect_ratio + color + movie_facebook_likes + director_facebook_likes + cast_total_facebook_likes + num_user_for_reviews + num_critic_for_reviews + genre_main, data = movies_train, alpha = 1)

2e) Use the coef(lasso_fit) function over the lasso fit object.


coef(lasso_fit)
## 28 x 1 sparse Matrix of class "dgCMatrix"
##                                        s1
## (Intercept)                14.23847859908
## duration                    0.06491974681
## rating_simpleG              6.10655027540
## rating_simplePG            15.00789965467
## rating_simplePG-13          .            
## rating_simpleR            -23.57834394192
## rating_simpleOther         -3.87188291769
## aspect_ratio                .            
## colorBlack and White        .            
## colorColor                  .            
## movie_facebook_likes        0.00008346256
## director_facebook_likes     .            
## cast_total_facebook_likes   0.00015814728
## num_user_for_reviews        0.06212181317
## num_critic_for_reviews      0.08942389397
## genre_mainAction            5.76913492881
## genre_mainAdventure         8.71352522210
## genre_mainAnimation         .            
## genre_mainBiography         .            
## genre_mainComedy            .            
## genre_mainCrime             .            
## genre_mainDocumentary       .            
## genre_mainDrama            -7.50306395567
## genre_mainFantasy           .            
## genre_mainHorror            .            
## genre_mainMystery           .            
## genre_mainSci-Fi            .            
## genre_mainOther             .

2f) Explain, in your own words, why some variables do not have estimated coefficients in the above table.

Some variables do not have coefficients because they do not improve the the model or the cross-validated MSE and are better off left out of the model.

2g) Call the plot function over the lasso fit object and describe the plot. What do the numbers at the top of the figure refer to?

The numbers at the top show the number of coefficients. As we add more coefficients, the mean squared error lowers as we reach 26 coefficients.


plot(lasso_fit)