# loading all libraries
library(tidyverse)
library(dplyr)
library(readr)
library(forcats) 
library(rsample)

Question 1: Training and Testing Datasets

1a) Load the IMDB_movies dataset. Create variables grossM and budgetM that are gross and budget, respectively, in units of 1 million. Also, create two new variables, log_gross and log_budget, that are equal to the natural log of gross and budget. Store the new data frame as the object movies_clean.


# Point the session at the course folder so the relative data path resolves.
# NOTE(review): hard-coded, machine-specific path; kept because code elsewhere
# may rely on the working directory -- confirm before replacing it.
setwd("/Users/nickkondo/OneDrive - Chapman University/BUS 696")

# Read the raw IMDB data and add gross and budget divided by 1,000,000.
movies <- mutate(
  read_csv("datasets/IMDB_movies.csv"),
  grossM = gross / 1000000,
  budgetM = budget / 1000000
)

# movies_clean keeps every column of movies and adds the natural logs of
# gross and budget.
movies_clean <- mutate(
  movies,
  log_gross = log(gross),
  log_budget = log(budget)
)

1b) Create a factor variable from content_rating. Store this new variable as rating_factor and add it to the movies_clean dataset.

fct_explicit_na creates a factor variable in which a blank (missing) response is given an explicit factor level rather than being left as a missing value. We are creating a variable from content_rating where missing values will be stored under the level “(Missing)”, the function's default label.


# Convert content_rating to a factor in which missing values become an
# explicit level instead of staying NA, and store it as rating_factor.
# NOTE(review): fct_explicit_na() is deprecated in forcats >= 1.0.0 in favour
# of fct_na_value_to_level(); switch once the installed version is confirmed.
movies_clean <- mutate(
  movies_clean,
  rating_factor = fct_explicit_na(content_rating)
)

1c) Create a new variable called rating_simple, using rating_factor as its source, that explicitly lists the four most common factor levels and the rest are given the category “Other”. Add this new variable to the movies_clean dataset.


# Keep the four most frequent rating_factor levels and lump every other level
# together, storing the result as rating_simple.
movies_clean <- mutate(
  movies_clean,
  rating_simple = fct_lump_n(rating_factor, n = 4)
)

# Show the resulting levels to confirm the lumping.
movies_clean %>%
  pull(rating_simple) %>%
  levels()
## [1] "G"     "PG"    "PG-13" "R"     "Other"

1d) Create a two way frequency table of content_rating against rating_simple to confirm the mutate function was successful.


# Cross-tabulate the original ratings (rows) against the lumped ratings
# (columns); every original rating should land in exactly one column.
table(
  movies_clean[["content_rating"]],
  movies_clean[["rating_simple"]]
)
##            
##                G   PG PG-13    R Other
##   Approved     0    0     0    0    17
##   G           91    0     0    0     0
##   GP           0    0     0    0     1
##   M            0    0     0    0     2
##   NC-17        0    0     0    0     6
##   Not Rated    0    0     0    0    42
##   Passed       0    0     0    0     3
##   PG           0  576     0    0     0
##   PG-13        0    0  1329    0     0
##   R            0    0     0 1737     0
##   Unrated      0    0     0    0    24
##   X            0    0     0    0    10

1e) Split the movies_clean dataset into training and testing sets of 75% and 25% each respectively. Call these objects movies_train and movies_test.


# Fix the RNG seed so the random split below is reproducible.
set.seed(4)

# 75% of the rows go to the training set; the remaining 25% form the test set.
movies_split <- movies_clean %>% initial_split(prop = 0.75)
movies_train <- movies_split %>% training()
movies_test <- movies_split %>% testing()

1f) Why do we split our dataset into training and testing sets?

We split our dataset so that the model learns patterns from one portion of the data (the training set) and is then evaluated on a separate portion it has never seen (the testing set). Comparing performance on the two sets shows whether the model predicts new data accurately or has merely memorized the patterns in the data it was trained on. This helps us detect and avoid overfitting and gives a more honest measure of the model's accuracy.

Question 2: Predicting Movie Gross

2a) Estimate a linear regression model where grossM is the dependent variable and imdb_score is the independent variable. Store this object as mod1 and run the summary command against this model.


# Simple linear regression of grossM on imdb_score, fit on the full cleaned
# dataset.
mod1 <- lm(formula = grossM ~ imdb_score, data = movies_clean)

# Print the coefficient table and overall fit statistics.
mod1 %>% summary()
## 
## Call:
## lm(formula = grossM ~ imdb_score, data = movies_clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -82.05 -43.34 -17.51  16.96 689.37 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  -39.291      6.786   -5.79        0.00000000759 ***
## imdb_score    13.978      1.036   13.49 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.24 on 3887 degrees of freedom
## Multiple R-squared:  0.04473,    Adjusted R-squared:  0.04449 
## F-statistic:   182 on 1 and 3887 DF,  p-value: < 0.00000000000000022

2b) Interpret the coefficient for imdb_score, being specific about the magnitude of the impact of the variable on gross, and the sign (positive or negative).

Increasing imdb_score by 1 point is associated with an increase in predicted gross of about $13.978 million. The sign is positive: movies with higher IMDB scores tend to gross more.

2c) What does a p-value in a linear regression model measure?

A p-value is the probability, assuming the null hypothesis is true (here, that the coefficient is zero), of obtaining an estimate at least as extreme as the one observed. The closer the p-value is to 0, the stronger the evidence against the null hypothesis; the closer it is to 1, the weaker that evidence.

2d) What is the p-value for the coefficient for imdb_score? What does this p-value imply about the relationship between imdb_score and gross?

The p-value for the imdb_score coefficient is < 2e-16, which implies that the positive relationship between imdb_score and gross is statistically significant and very unlikely to be due to chance alone.

2e) Estimate a regression model with log_gross as the dependent variable and imdb_score, log_budget, title_year, and rating_simple as the independent variables using the training dataset. Store this model as mod2.

The dependent variable is log_gross and the independent variables are imdb_score, log_budget, title_year, rating_simple.


# Log-log style regression of log_gross on score, log budget, release year and
# simplified rating.
# Fit on the training split only, as the exercise requires: fitting on
# movies_clean lets the model see the test rows, which makes any test-set
# error computed from mod2 optimistic.
# NOTE: printed output captured from the earlier full-data fit must be
# regenerated after this change.
mod2 <-
  lm(log_gross ~ imdb_score + log_budget + title_year + rating_simple,
     data = movies_train)

2f) Run the summary function against mod2 to show the variables from the regression.


# Print the coefficient table and fit statistics for mod2.
mod2 %>% summary()
## 
## Call:
## lm(formula = log_gross ~ imdb_score + log_budget + title_year + 
##     rating_simple, data = movies_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0210  -0.5734   0.3374   0.9778   7.7445 
## 
## Coefficients:
##                     Estimate Std. Error t value             Pr(>|t|)
## (Intercept)        77.740525   5.698630  13.642 < 0.0000000000000002
## imdb_score          0.283645   0.026204  10.824 < 0.0000000000000002
## log_budget          0.748710   0.019486  38.424 < 0.0000000000000002
## title_year         -0.037569   0.002861 -13.131 < 0.0000000000000002
## rating_simplePG     0.162142   0.190862   0.850              0.39564
## rating_simplePG-13  0.092028   0.184358   0.499              0.61768
## rating_simpleR     -0.572770   0.183226  -3.126              0.00178
## rating_simpleOther -2.614717   0.229492 -11.393 < 0.0000000000000002
##                       
## (Intercept)        ***
## imdb_score         ***
## log_budget         ***
## title_year         ***
## rating_simplePG       
## rating_simplePG-13    
## rating_simpleR     ** 
## rating_simpleOther ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.688 on 3881 degrees of freedom
## Multiple R-squared:  0.4273, Adjusted R-squared:  0.4262 
## F-statistic: 413.6 on 7 and 3881 DF,  p-value: < 0.00000000000000022

2h) Why is there no coefficient estimated for rating_simpleG?

There is no coefficient for rating_simpleG because that level is used as the base (reference) category for the other levels of the factor. For example, the coefficient generated for rating_simplePG is interpreted relative to, and in comparison with, rating_simpleG. A G-rated movie is the case in which all of the other rating_simple indicator variables are 0, so its effect is absorbed into the intercept.

2i) Interpret the coefficient on log_budget, and describe the effect this variable has on movie gross, including the sign and magnitude of the effect.

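A 1% increase in budget is associated with approximately a 0.75% increase in gross, holding the other variables fixed. The effect is positive, and because both gross and budget are in logs, the coefficient (0.749) is interpreted as an elasticity.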

2j) What does the coefficient for budget suggest about the return on investment to movie budgets?

Because the coefficient is less than 1, gross rises less than proportionally with budget. This suggests that as you increase budget, the gross increases at a slower rate, and therefore the ROI may start to decrease at some point.

Question 3: Generating Movie Gross Predictions

3a) Use mod2 to generate gross predictions for movies in the testing and training set. Store these as preds_test and preds_train respectively.


# Predictions are on the log scale because mod2 models log_gross.
preds_train <- predict(mod2, newdata = movies_train)

# Named preds_test to match preds_train and the wording of the exercise.
preds_test <- predict(mod2, newdata = movies_test)

# Backward-compatible alias: the script originally stored this as pred_test.
pred_test <- preds_test

3b) Calculate root mean squared error in the testing and training sets.


# Root mean squared error between observed values and predictions.
#
# By default both inputs are exponentiated before the error is computed, so
# natural-log inputs give an RMSE in the original (un-logged) units.
#
# Args:
#   true_values:    numeric vector of observed values.
#   predictions:    numeric vector of predictions (same length, or length 1).
#   back_transform: function applied to both vectors before differencing.
#                   Defaults to exp; use identity for inputs that are already
#                   on the scale you want to report.
#   na.rm:          if TRUE, missing squared errors are dropped; the default
#                   FALSE returns NA whenever any value is missing.
#
# Returns: a single numeric value.
get_rmse <- function(true_values, predictions,
                     back_transform = exp, na.rm = FALSE) {
  # Guard against silent recycling of mismatched vectors (a scalar is allowed).
  stopifnot(
    is.function(back_transform),
    length(true_values) == length(predictions) ||
      length(true_values) == 1L ||
      length(predictions) == 1L
  )
  errors <- back_transform(true_values) - back_transform(predictions)
  sqrt(mean(errors^2, na.rm = na.rm))
}

# Training RMSE (get_rmse exponentiates both log-scale inputs first)

get_rmse(true_values = movies_train$log_gross, predictions = preds_train)
## [1] 76006108


# Testing RMSE (get_rmse exponentiates both log-scale inputs first)
get_rmse(true_values = movies_test$log_gross, predictions = pred_test)
## [1] 68985945


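# Why exponentiate the values? The model predicts log_gross, so both the true values and the predictions are on the log scale. Exponentiating converts them back to gross itself, so the RMSE is measured in the original units rather than in log units.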

3c) Calculating mean absolute error in the testing and training sets.


# Mean absolute error between observed values and predictions.
#
# By default the error is computed on the scale of the inputs as given, so
# log-scale inputs give an MAE in log units.
#
# Args:
#   true_values:    numeric vector of observed values.
#   predictions:    numeric vector of predictions (same length, or length 1).
#   back_transform: function applied to both vectors before differencing.
#                   Defaults to identity (no transformation); pass exp to
#                   report the error of log-scale inputs in original units.
#   na.rm:          if TRUE, missing absolute errors are dropped; the default
#                   FALSE returns NA whenever any value is missing.
#
# Returns: a single numeric value.
get_mae <- function(true_values, predictions,
                    back_transform = identity, na.rm = FALSE) {
  # Guard against silent recycling of mismatched vectors (a scalar is allowed).
  stopifnot(
    is.function(back_transform),
    length(true_values) == length(predictions) ||
      length(true_values) == 1L ||
      length(predictions) == 1L
  )
  errors <- back_transform(true_values) - back_transform(predictions)
  mean(abs(errors), na.rm = na.rm)
}

# Training MAE (computed on the log scale; inputs are not exponentiated)
get_mae(true_values = movies_train$log_gross, predictions = preds_train)
## [1] 1.178024

# Testing MAE (computed on the log scale; inputs are not exponentiated)
get_mae(true_values = movies_test$log_gross, predictions = pred_test)
## [1] 1.219663

3d) Create a results data frame that contains both the model predicted and actual values of movie gross for the test and training sets.


# Stack test rows then training rows into one data frame of predicted and
# actual gross. Predictions are exponentiated back from the log scale, and
# both columns are divided by 1e6.
results <- data.frame(
  preds = c(exp(pred_test), exp(preds_train)) / 1e6,
  true = c(movies_test$gross, movies_train$gross) / 1e6,
  type = rep(
    c("Test", "Train"),
    times = c(nrow(movies_test), nrow(movies_train))
  )
)

3e) Plot the predicted against true values of movie gross.


# Scatter plot of predicted against true gross, one panel per value of `type`.
# Points on the dashed 45-degree line are predicted exactly.
ggplot(results, aes(x = true, y = preds)) +
  geom_point(alpha = 1 / 20) +
  # xlim()/ylim() remove (with a warning) observations outside 0-300.
  xlim(0, 300) +
  ylim(0, 300) +
  # Pass intercept/slope as parameters rather than inside aes(): with aes()
  # the layer inherits `results` and draws one identical line per row.
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  facet_wrap(~ type) +
  theme_minimal() +
  labs(
    x = "True Movie Gross ($M USD)",
    y = "Predicted Movie Gross ($M USD)"
  )