# loading all libraries
library(tidyverse)
library(forcats)
library(rsample)
library(dplyr)
library(ggplot2)
library(plotROC)
library(tidyverse)
library(rsample)
library(glmnet)
library(glmnetUtils)
library(forcats)
library(coefplot)
library(here)
1a) Generate new features and split the data into testing and training sets.
movies <- read.csv("/Users/nickkondo/OneDrive - Chapman University/BUS 696/datasets/IMDB_movies.csv")
movies_clean <-
movies %>%
mutate(budgetM = budget/1000000,
grossM = gross/1000000,
profitM = grossM - budgetM,
ROI = profitM/budgetM,
blockbuster = ifelse(profitM > 100, 1, 0) %>% factor(., levels = c("0","1")),
genre_main = as.factor(unlist(map(strsplit(as.character(movies$genre),"\\|"),1))) %>% fct_lump(12),
rating_simple = fct_lump(content_rating, n = 4)) %>%
filter(budget < 400000000,
content_rating != "",
content_rating != "Not Raated") %>%
mutate(rating_simple = rating_simple %>% fct_drop()) %>%
distinct()
movies_split <- initial_split(movies_clean)
movies_train <- training(movies_split)
movies_test <- testing(movies_split)
1b) The variable “blockbuster” in the movies dataframe equals 1 if a movie earns more than $100M USD in profit, and 0 otherwise. Fit a logistic regression model to predict whether a movie is a blockbuster using imdb_score, budgetM, title_year, and genre_main as predictors. Store this logistic model as movies_logit1 and run the summary command over the fitted logistic model object.
movies_logit1 <- glm(blockbuster ~ imdb_score + budgetM + title_year + genre_main,
family = binomial,
data = movies_clean)
summary(movies_logit1)
##
## Call:
## glm(formula = blockbuster ~ imdb_score + budgetM + title_year +
## genre_main, family = binomial, data = movies_clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6273 -0.3365 -0.2196 -0.1310 3.7848
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 27.462266 12.691416 2.164
## imdb_score 1.239380 0.103212 12.008
## budgetM 0.011590 0.001487 7.795
## title_year -0.019602 0.006269 -3.127
## genre_mainAdventure 0.271368 0.218499 1.242
## genre_mainAnimation 0.382911 0.485534 0.789
## genre_mainBiography -0.808113 0.387794 -2.084
## genre_mainComedy 0.264825 0.232279 1.140
## genre_mainCrime -1.867656 0.541770 -3.447
## genre_mainDocumentary -1.275772 1.058388 -1.205
## genre_mainDrama -0.671320 0.265240 -2.531
## genre_mainFantasy -14.288517 606.075728 -0.024
## genre_mainHorror 0.081534 0.514399 0.159
## genre_mainMystery -14.852579 744.146251 -0.020
## genre_mainSci-Fi -15.011164 1343.251662 -0.011
## genre_mainOther 0.996882 0.945102 1.055
## Pr(>|z|)
## (Intercept) 0.030476 *
## imdb_score < 0.0000000000000002 ***
## budgetM 0.00000000000000644 ***
## title_year 0.001767 **
## genre_mainAdventure 0.214249
## genre_mainAnimation 0.430323
## genre_mainBiography 0.037172 *
## genre_mainComedy 0.254238
## genre_mainCrime 0.000566 ***
## genre_mainDocumentary 0.228052
## genre_mainDrama 0.011374 *
## genre_mainFantasy 0.981191
## genre_mainHorror 0.874061
## genre_mainMystery 0.984076
## genre_mainSci-Fi 0.991084
## genre_mainOther 0.291523
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1702.0 on 3792 degrees of freedom
## Residual deviance: 1341.1 on 3777 degrees of freedom
## AIC: 1373.1
##
## Number of Fisher Scoring iterations: 16
1c) Exponentiate the fitted coefficient vector and print the results.
exp(movies_logit1$coefficients)
## (Intercept) imdb_score
## 844715685676.3284912109375 3.4534715375103
## budgetM title_year
## 1.0116574601468 0.9805891246686
## genre_mainAdventure genre_mainAnimation
## 1.3117573463194 1.4665467912410
## genre_mainBiography genre_mainComedy
## 0.4456984199728 1.3032024874441
## genre_mainCrime genre_mainDocumentary
## 0.1544852935298 0.2792154021719
## genre_mainDrama genre_mainFantasy
## 0.5110333914144 0.0000006231261
## genre_mainHorror genre_mainMystery
## 1.0849496221323 0.0000003544924
## genre_mainSci-Fi genre_mainOther
## 0.0000003025063 2.7098183374907
1d) Interpret the coefficient on genre_mainAdventure.
The ratio of odds of a blockbuster movie across the Adventure genre is 1.31, or the Adventure genre has a 31% higher probability of being a blockbuster movie
1e) Interpret the coefficient on imdb_score.
The coefficient of 3.45 means that for every increase of 1 unit in imdb_score, the odds of a movie being a blockbuster increases by 245%.
1f) Suppose that we estimate a raw, un-exponentiated coefficient for a logistic regression of 1.05. Increasing this variable by one unit increases what by 1.05?
In a logistic regression we make predictions to lie in between 0 and 1, in terms of probabilities. The logistic models the dependent variable as the log-odds ratio, and this increases by 1.05 for every increase in one unit.
1g) What is the null hypothesis for exponentiated coefficients in a logistic regression?
The null hypothesis for exponentiation coefficients in a logistic regression are there is no relationship between the predictor variable and the response variable. In other words, the model does not predict the response variable any better than with no predictor variables at all.
1h) Using the movies_logit1 model generate predicted probabilities for the test and training sets.
preds_train <- predict(movies_logit1, newdata = movies_train)
preds_test <- predict(movies_logit1, newdata = movies_test)
head(preds_train)
## 1528 587 2867 1795 71 684
## -1.813262 -3.948243 -3.175616 -2.533330 -3.772698 -2.985199
head(preds_test)
## 1 3 5 8 11 19
## 0.6203098 -0.7678983 -0.7402354 0.1576178 -1.8762899 -0.9408938
1i) Generate results dataframes.
results_train <- data.frame( `truth`= movies_train %>% select(blockbuster) %>%
mutate(blockbuster = as.numeric(blockbuster)),
`Class1` = preds_train,
`type` = rep("train",length(preds_train))
)
results_test <- data.frame(
`truth` = movies_test %>% select(blockbuster) %>%
mutate(blockbuster = as.numeric(blockbuster)),
`Class1` = preds_test,
`type` = rep("test",length(preds_test))
)
results <- bind_rows(results_train, results_test)
1j) Use the slice() function to print the first ten rows of the results dataframe.
The blockbuster variable returns a 1 if the movie earns more than $100M in profit, otherwise a 0 is returned. Class1 is the probability of a “blockbuster” movie. Type returns if the varaible is in the train set or the testing set.
slice(results_train, 1:10)
slice(results_test, 1:10)
1k) Generate two ROC plots, one each for the test and training sets. Be sure to label the cutoff probabilities along the ROC lines using the cutoffs.at = c(0.99,0.9,0.7,0.5,0.3,0.1,0).
p_train <- ggplot(results_train,
aes(m=Class1, d = blockbuster)) +
geom_roc(labelsize = 3.5,
cutoffs.at = c(0.99,0.9,0.7,0.5,0.3,0.1,0)) +
theme_minimal(base_size = 16)
p_test <- ggplot(results_test,
aes(m=Class1, d = blockbuster)) +
geom_roc(labelsize = 3.5,
cutoffs.at = c(0.99,0.9,0.7,0.5,0.3,0.1,0)) +
theme_minimal(base_size = 16)
print(p_train)
print(p_test)
1l) Calculate the AUC for the test and training sets using the function calc_auc.
calc_auc(p_train)
calc_auc(p_test)
AUC means area under the ROC curve. A higher AUC is better because it means a better ROC plot which measures the consequence on true positive fraction and false positive fraction at different cutoffs.
2a) Create a ridge model against movie_train using variables grossM, duration, rating_simple, aspect_ratio, color, movie_facebook_likes, director_facebook_likes, cast_total_facebook_likes, num_user_for_reviews, num_critic_for_reviews, and genre_main.
ridge_mod <- cv.glmnet(grossM ~ duration + rating_simple + aspect_ratio + color + movie_facebook_likes + director_facebook_likes + cast_total_facebook_likes + num_user_for_reviews + num_critic_for_reviews + genre_main, data = movies_train)
2b) Use the coef function over the ridge_fit object to print the coefficient matrix.
print(ridge_mod$lambda.min)
## [1] 0.619576
print(ridge_mod$lambda.1se)
## [1] 4.797152
coef(ridge_mod, s = ridge_mod$lambda.min) %>%
round(3)
## 28 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -7.068
## duration 0.258
## rating_simpleG 27.887
## rating_simplePG 22.197
## rating_simplePG-13 .
## rating_simpleR -26.110
## rating_simpleOther -20.064
## aspect_ratio -3.159
## colorBlack and White -15.703
## colorColor 0.000
## movie_facebook_likes 0.000
## director_facebook_likes 0.000
## cast_total_facebook_likes 0.000
## num_user_for_reviews 0.064
## num_critic_for_reviews 0.101
## genre_mainAction 13.865
## genre_mainAdventure 15.869
## genre_mainAnimation 24.957
## genre_mainBiography -4.860
## genre_mainComedy 5.195
## genre_mainCrime -4.414
## genre_mainDocumentary -4.741
## genre_mainDrama -12.550
## genre_mainFantasy .
## genre_mainHorror -2.973
## genre_mainMystery -2.411
## genre_mainSci-Fi -20.537
## genre_mainOther 24.541
coef(ridge_mod, s = ridge_mod$lambda.1se) %>%
round(3)
## 28 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 16.631
## duration 0.047
## rating_simpleG 3.258
## rating_simplePG 14.061
## rating_simplePG-13 .
## rating_simpleR -23.144
## rating_simpleOther -1.725
## aspect_ratio .
## colorBlack and White .
## colorColor .
## movie_facebook_likes 0.000
## director_facebook_likes .
## cast_total_facebook_likes 0.000
## num_user_for_reviews 0.062
## num_critic_for_reviews 0.089
## genre_mainAction 5.089
## genre_mainAdventure 8.231
## genre_mainAnimation .
## genre_mainBiography .
## genre_mainComedy .
## genre_mainCrime .
## genre_mainDocumentary .
## genre_mainDrama -6.735
## genre_mainFantasy .
## genre_mainHorror .
## genre_mainMystery .
## genre_mainSci-Fi .
## genre_mainOther .
2c) Use the plot function over the ridge fit object. Describe the plot in your own words, including the two vertical dashed lines.
plot(ridge_mod)
2d) Estimate a lasso model using the function cv.lasso, using the same independent and dependent variables as before, and store the model as lasso_fit.
lasso_fit <- cv.glmnet(grossM ~ duration + rating_simple + aspect_ratio + color + movie_facebook_likes + director_facebook_likes + cast_total_facebook_likes + num_user_for_reviews + num_critic_for_reviews + genre_main, data = movies_train, alpha = 1)
2e) Use the coef(lasso_fit) function over the lasso fit object.
coef(lasso_fit)
## 28 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 14.23847859908
## duration 0.06491974681
## rating_simpleG 6.10655027540
## rating_simplePG 15.00789965467
## rating_simplePG-13 .
## rating_simpleR -23.57834394192
## rating_simpleOther -3.87188291769
## aspect_ratio .
## colorBlack and White .
## colorColor .
## movie_facebook_likes 0.00008346256
## director_facebook_likes .
## cast_total_facebook_likes 0.00015814728
## num_user_for_reviews 0.06212181317
## num_critic_for_reviews 0.08942389397
## genre_mainAction 5.76913492881
## genre_mainAdventure 8.71352522210
## genre_mainAnimation .
## genre_mainBiography .
## genre_mainComedy .
## genre_mainCrime .
## genre_mainDocumentary .
## genre_mainDrama -7.50306395567
## genre_mainFantasy .
## genre_mainHorror .
## genre_mainMystery .
## genre_mainSci-Fi .
## genre_mainOther .
2f) Explain, in your own words, why some variables do not have estimated coefficients in the above table.
Some variables do not have coefficients because they do not improve the the model or the cross-validated MSE and are better off left out of the model.
2g) Call the plot function over the lasso fit object and describe the plot. What do the numbers at the top of the figure refer to?
The numbers at the top show the number of coefficients. As we add more coefficients, the mean squared error lowers as we reach 26 coefficients.
plot(lasso_fit)