# load all libraries
library('tidyverse')
library('rsample')
library('partykit')
library('PerformanceAnalytics')
library('rpart')
library('rpart.plot')
library('randomForest')
library('randomForestExplainer')
library('ggridges')
1a) Clean the data to generate a dataset we can use for modeling.
# tidyverse: clean the raw IMDB data
options(scipen = 50)
set.seed(1861)
movies <- read.csv(here::here("datasets", "IMDB_movies.csv"))
movies_clean <- movies %>%
# drop movies with missing values or extreme outliers
filter(budget < 4e+08,
       content_rating != "",
       content_rating != "Not Rated",
       plot_keywords != "") %>%
# keep the first listed genre and plot keyword; rescale gross and budget to $M
mutate(genre_main = unlist(map(strsplit(as.character(genres), "\\|"), 1)),
       plot_main = unlist(map(strsplit(as.character(plot_keywords), "\\|"), 1)),
       grossM = gross / 1e+06,
       budgetM = budget / 1e+06) %>%
# collapse rare factor levels and rescale cast likes to thousands
mutate(genre_main = fct_lump(genre_main, 7),
       plot_first = fct_lump(plot_main, 20),
       content_rating = fct_lump(content_rating, 4),
       country = fct_lump(country, 8),
       language = fct_lump(language, 4),
       cast_total_facebook_likes000s = cast_total_facebook_likes / 1000) %>%
drop_na()
# flag directors in the top 10% by number of films
top_director <- movies_clean %>%
group_by(director_name) %>%
summarize(num_films = n()) %>%
top_frac(0.1, num_films) %>%
mutate(top_director = 1) %>%
select(-num_films)
movies_clean <- movies_clean %>%
left_join(top_director, by = "director_name") %>%
mutate(top_director = replace_na(top_director, 0)) %>%
# drop identifiers and variables we will not model with
select(-c(director_name, actor_2_name, gross, genres, actor_1_name,
          movie_title, actor_3_name, plot_keywords, movie_imdb_link,
          budget, color, aspect_ratio, plot_main, actor_3_facebook_likes,
          actor_2_facebook_likes, num_critic_for_reviews,
          num_voted_users, num_user_for_reviews))
sapply(movies_clean %>%
select_if(is.factor), table)
## $country
##
## Australia Canada France Germany Hong Kong Spain UK
## 39 57 97 79 13 19 315
## USA Other
## 2973 119
##
## $language
##
## English French Mandarin Spanish Other
## 3575 32 13 22 69
##
## $content_rating
##
## G PG PG-13 R Other
## 87 565 1305 1693 61
##
## $genre_main
##
## Action Adventure Biography Comedy Crime Drama Horror
## 952 367 204 979 250 652 163
## Other
## 144
##
## $plot_first
##
## 1950s 1970s actor african american
## 18 18 24 24
## alien apartment army assassin
## 69 19 20 26
## baby bank bar basketball
## 22 19 18 18
## battle beach best friend box office flop
## 26 19 32 28
## boy christmas cia college
## 36 18 19 22
## death friend Other
## 40 21 3155
# rsample: split into training and test sets
movies_split <- initial_split(movies_clean, prop = 0.75)
movies_train <- training(movies_split)
movies_test <- testing(movies_split)
1b) Produce a ridgeline plot showing the distribution of grossM for varying levels of plot_first. Which plot keywords are associated with the most blockbusters (gross above $300M)?
The plot keywords associated with the most blockbusters (gross above $300 million) are "death", "college", "battle", and "assassin".
ggplot(data = movies_clean, aes(x = grossM, y = plot_first)) +
  geom_density_ridges(scale = 3, fill = "lightblue") +
  xlim(100, 800)  # zoom in on the upper tail where blockbusters sit
1c) Using ggplot2, produce two additional interesting visualizations using some of the new variables: plot_first, cast_total_facebook_likes000s, and top_director.
The first visualization shows the relationship between plot keywords and cast_total_facebook_likes000s, with the binary top_director variable distinguished by color.
The second plots log(grossM) against log(cast_total_facebook_likes000s), again colored by top_director.
movies_clean$top_director <- as.factor(movies_clean$top_director)
ggplot(movies_clean,
aes(x = cast_total_facebook_likes000s, y = plot_first, color = top_director)) +
geom_point() +
labs(x = "Cast Total Facebook Likes (Thousands 000s)", y = "Plot Key Words")
ggplot(movies_clean,
aes(x = log(cast_total_facebook_likes000s), y = log(grossM), color = top_director)) +
geom_point() +
labs(x = "log Cast Total Facebook Likes (Thousands 000s)", y = "log Gross (Millions)")
1d) In your own words, describe what the parameter mtry controls in the function randomForest.
mtry is a tuning parameter for random forests: the number of predictors randomly selected as split candidates at each node. Every time a split is considered, the forest draws a fresh random subset of mtry of the P predictors and evaluates only those as candidates.
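As a toy illustration of the candidate draw (the pool below is just a handful of the predictors in movies_clean, and mtry = 3 is arbitrary):
# at each split, only a fresh random subset of mtry predictors is evaluated
candidate_pool <- c("budgetM", "imdb_score", "title_year", "genre_main",
                    "country", "content_rating", "top_director")
sample(candidate_pool, 3)  # the candidates considered at one hypothetical split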
1e) What type of model results if we set mtry = P the number of variables in the dataset?
If we set mtry = P, every predictor is a candidate at every split, so the random predictor selection does nothing: the result is bagging (an ensemble of bagged trees), not a standard random forest.
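A sketch of fitting the bagged version this way (this assumes the profit response added to movies_train in 1j below; ntree = 200 is arbitrary):
p <- ncol(movies_train) - 1  # all columns except the profit response
bag_fit <- randomForest(profit ~ ., data = movies_train,
                        mtry = p, ntree = 200, na.action = na.roughfix)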
1f) What does the parameter ntree in the randomForest function control?
ntree specifies the number of trees grown in the forest. A random forest is a form of bagging with decorrelated trees: each of the ntree trees is fit to its own bootstrap sample, and their predictions are averaged to form the forest's prediction.
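The averaging can be made visible with predict.all = TRUE, which returns each tree's prediction alongside the aggregate (a sketch using the forest and test set from 1j onward):
per_tree <- predict(rf_fit_nicholas_kondo, newdata = movies_test,
                    predict.all = TRUE)
# the forest prediction is the mean of the ntree per-tree predictions
head(rowMeans(per_tree$individual))
head(per_tree$aggregate)  # identical to the row means above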
1g) In your own words, describe why setting mtry less than P often results in a better model fit.
Setting mtry less than P usually improves fit because it decorrelates the trees. When every split can choose from all predictors, a few strong variables dominate every tree, the trees end up highly correlated, and averaging them does little to reduce variance; forcing each split to consider a random subset spreads the splits across predictors and lowers the variance of the averaged forest. The loop in 1l lets us check this empirically for these data.
1h) While we should cross-validate to select the optimal mtry and ntree, if we do not have time to cross-validate (or are too lazy to do so) a good rule of thumb is to set mtry equal to the square root of the number of predictors. Given this, what value should we assign to mtry? Hint, it’s not 4 or 5.
The sqrt(p) rule of thumb applies to classification forests. Since profit is continuous, this is a regression forest, and the regression default is mtry = p/3, which works out to mtry = 6 here.
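The regression default is easy to compute (once profit is added in 1j it becomes the response, so every current column of movies_train counts as a predictor):
p <- ncol(movies_train)  # predictor count once profit is the response
floor(p / 3)             # randomForest's default mtry for regression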
1i) What is a hyperparameter and what are the hyperparameters in a random forest model?
A hyperparameter is a setting chosen before training that controls how the model is fit, rather than something learned from the data. In a random forest, mtry and ntree are the main hyperparameters; the terminal node size (nodesize) and bootstrap sample size (sampsize) can also be tuned.
1j) Fit a random forest model using the movies_train dataset with 200 trees and the mtry suggested above. Store this object as rf_fit_[your_name], where [your_name] is replaced with your own name. Be sure to select the option importance = TRUE when fitting the model.
movies_train <- movies_train %>%
  mutate(profit = grossM - budgetM)
# the forest type (regression) is inferred from the numeric response,
# so no type argument is needed
rf_fit_nicholas_kondo <- randomForest(profit ~ .,
                                      data = movies_train,
                                      mtry = 6,
                                      na.action = na.roughfix,
                                      ntree = 600,
                                      importance = TRUE)
print(rf_fit_nicholas_kondo)
##
## Call:
## randomForest(formula = profit ~ ., data = movies_train, mtry = 6, ntree = 600, importance = TRUE, na.action = na.roughfix)
## Type of random forest: regression
## Number of trees: 600
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 274.6752
## % Var explained: 90.82
1k) How does our model improve as we increase the number of trees? Use plot over the fitted random forest object to determine how many trees should be used.
Adding trees lowers the error steeply at first and then the curve plateaus; beyond the plateau, extra trees cost computation without improving accuracy. Plotting the fitted object shows the OOB error stabilizing well before 600 trees, so far fewer trees would suffice.
plot(rf_fit_nicholas_kondo)
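To put a number on the plateau instead of eyeballing the base plot, we can inspect the per-tree OOB error stored in the fitted object (a sketch; the 1% tolerance is an arbitrary choice):
mse_vec <- rf_fit_nicholas_kondo$mse
# smallest number of trees whose OOB MSE is within 1% of the 600-tree value
min(which(mse_vec <= 1.01 * mse_vec[length(mse_vec)]))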
1l) Generate test predictions, out-of-bag predictions, and in-bag predictions. Calculate mean squared error for the in-bag, test, and out-of-bag sets. Comment on the performance in 1-2 sentences.
In-bag predictions score the model on observations the trees were trained on, so the in-bag MSE is optimistically low. Out-of-bag predictions for each observation come only from trees whose bootstrap sample excluded it, so the OOB MSE is an honest estimate and should be close to the test MSE.
The OOB error here is MSE because this is a regression forest; in the loop below we also see that the OOB error decreases as mtry increases for these data.
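Below is a sketch of the three prediction sets and their MSEs, run before the mtry loop reuses the training data (profit must first be added to movies_test, mirroring 1j; calling predict with no newdata is how randomForest returns OOB predictions). The in-bag error should come out far below the OOB and test errors, which should roughly agree.
movies_test <- movies_test %>%
  mutate(profit = grossM - budgetM)
pred_inbag <- predict(rf_fit_nicholas_kondo, newdata = movies_train)  # in-bag (training)
pred_oob   <- predict(rf_fit_nicholas_kondo)                          # OOB
pred_test  <- predict(rf_fit_nicholas_kondo, newdata = movies_test)   # test
c(inbag = mean((movies_train$profit - pred_inbag)^2),
  oob   = mean((movies_train$profit - pred_oob)^2),
  test  = mean((movies_test$profit - pred_test)^2))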
# fit a forest for each candidate mtry, recording the OOB error;
# use a temporary name so the fit from 1j is not overwritten
oob_err <- NULL
for (mtry in 1:9) {
  rf_tmp <- randomForest(profit ~ .,
                         data = movies_train,
                         mtry = mtry,
                         na.action = na.roughfix,
                         ntree = 600)
  oob_err[mtry] <- rf_tmp$mse[600]  # OOB MSE after all 600 trees
  cat(mtry, " ")
}
## 1 2 3 4 5 6 7 8 9
results_DF <- data.frame(mtry = 1:9, oob_err)
ggplot(results_DF, aes(x = mtry, y = oob_err)) +
  geom_point() +
  theme_minimal() +
  xlim(1, 9)
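To pick the winning mtry programmatically rather than reading it off the plot (a one-line sketch):
results_DF$mtry[which.min(results_DF$oob_err)]  # mtry with the lowest OOB MSE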
2a) What are the top 5 most important variables for the random forest model? Use varImpPlot over the fitted random forest object, using the option type = 1 to discover the most important variables.
The top 5 variables are grossM, budgetM, imdb_score, genre_main, and title_year.
The error previously seen with type = 1 arose because the mtry loop in 1l overwrote rf_fit_nicholas_kondo with fits that did not set importance = TRUE; with the loop writing to rf_tmp instead, the permutation-importance plot works.
varImpPlot(rf_fit_nicholas_kondo, type = 1)
2b) This plot shows the distribution of minimal depth for the top ten variables. A variable's minimal depth in a tree is how close to the root it first appears as a splitting variable, so small values indicate a variable that drives the important early splits across the forest. The variables are ranked by mean minimal depth, which by default is calculated using the top trees.
plot_min_depth_distribution(rf_fit_nicholas_kondo)
2c) We plot the forest's prediction over a grid of values for each pair of variables. The function takes the forest, the training data, and the two variables to put on the x and y axes: first budgetM against imdb_score, then title_year against budgetM. The first plot shows a mild interaction: predicted profit is highest when imdb_score is high and budgetM is low. The second plot shows little interaction.
plot_predict_interaction(rf_fit_nicholas_kondo,
movies_train,
'budgetM',
'imdb_score')
plot_predict_interaction(rf_fit_nicholas_kondo,
movies_train,
'title_year',
'budgetM')