Machine Learning for Managers


# loading libraries in this chunk
library(tidyverse)
library(dplyr)
library(here)
library(fs)
library(dplyr)
library(ggplot2)
library(ggrepel)

Question 1: Loading Files and Working With Directories.

1a) What does the function dir_ls() do?

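The dir_ls() function from the fs package lists the files and directories in a given path; called with no arguments, it lists the contents of the current working directory.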

# List every entry in the current working directory ("." is the default path).
dir_ls(path = ".")
## Problem Set 1.Rmd             Problem Set 1.html            
## Problem Set 1.nb.html         Problem-Set-1.Rmd             
## Problem-Set-1.html            Problem-Set-1.knit.md         
## Problem-Set-1.log             Problem-Set-1.tex             
## Problem-Set-1_cache           Problem-Set-1_files           
## ProblemSet1.Rmd               ProblemSet1.nb.html           
## ProblemSet1_cache             kondo_problemset1_696.Rmd     
## kondo_problemset1_696.nb.html

1b) Download the IMDB_movies dataset and read the dataset into R. Create an object called movies that holds the dataset.

# Read the IMDB data from the project's datasets/ folder into a data frame.
movies <- read.csv(file = here("datasets", "IMDB_movies.csv"))

1c) What are the dimensions of the dataset? There are 3889 rows and 28 columns.


# Number of rows followed by number of columns.
c(nrow(movies), ncol(movies))
## [1] 3889   28

1d) What are the names of the variables in the dataset?


# Column (variable) names of the data frame.
colnames(movies)
##  [1] "movie_title"               "director_name"            
##  [3] "gross"                     "budget"                   
##  [5] "country"                   "title_year"               
##  [7] "imdb_score"                "language"                 
##  [9] "duration"                  "genres"                   
## [11] "content_rating"            "aspect_ratio"             
## [13] "color"                     "plot_keywords"            
## [15] "movie_facebook_likes"      "director_facebook_likes"  
## [17] "cast_total_facebook_likes" "facenumber_in_poster"     
## [19] "actor_1_facebook_likes"    "actor_1_name"             
## [21] "actor_2_facebook_likes"    "actor_2_name"             
## [23] "actor_3_facebook_likes"    "actor_3_name"             
## [25] "num_user_for_reviews"      "num_critic_for_reviews"   
## [27] "num_voted_users"           "movie_imdb_link"

1e) How can we determine the data types of each variable? Which variables are characters?

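We can use glimpse() to see the type of every variable, or keep only the character columns and print their names. The character variables are: movie_title, director_name, country, language, genres, content_rating, color, plot_keywords, actor_1_name, actor_2_name, actor_3_name, movie_imdb_link.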


# Compact overview: one line per variable showing its type and first values.
glimpse(movies)
## Rows: 3,889
## Columns: 28
## $ movie_title               <chr> "Avatar", "Pirates of the Caribbea…
## $ director_name             <chr> "James Cameron", "Gore Verbinski",…
## $ gross                     <int> 760505847, 309404152, 200074175, 4…
## $ budget                    <dbl> 237000000, 300000000, 245000000, 2…
## $ country                   <chr> "USA", "USA", "UK", "USA", "USA", …
## $ title_year                <int> 2009, 2007, 2015, 2012, 2012, 2007…
## $ imdb_score                <dbl> 7.9, 7.1, 6.8, 8.5, 6.6, 6.2, 7.8,…
## $ language                  <chr> "English", "English", "English", "…
## $ duration                  <int> 178, 169, 148, 164, 132, 156, 100,…
## $ genres                    <chr> "Action|Adventure|Fantasy|Sci-Fi",…
## $ content_rating            <chr> "PG-13", "PG-13", "PG-13", "PG-13"…
## $ aspect_ratio              <dbl> 1.78, 2.35, 2.35, 2.35, 2.35, 2.35…
## $ color                     <chr> "Color", "Color", "Color", "Color"…
## $ plot_keywords             <chr> "avatar|future|marine|native|parap…
## $ movie_facebook_likes      <int> 33000, 0, 85000, 164000, 24000, 0,…
## $ director_facebook_likes   <int> 0, 563, 0, 22000, 475, 0, 15, 0, 2…
## $ cast_total_facebook_likes <int> 4834, 48350, 11700, 106759, 1873, …
## $ facenumber_in_poster      <int> 0, 0, 1, 0, 1, 0, 1, 4, 3, 0, 0, 1…
## $ actor_1_facebook_likes    <int> 1000, 40000, 11000, 27000, 640, 24…
## $ actor_1_name              <chr> "CCH Pounder", "Johnny Depp", "Chr…
## $ actor_2_facebook_likes    <int> 936, 5000, 393, 23000, 632, 11000,…
## $ actor_2_name              <chr> "Joel David Moore", "Orlando Bloom…
## $ actor_3_facebook_likes    <int> 855, 1000, 161, 23000, 530, 4000, …
## $ actor_3_name              <chr> "Wes Studi", "Jack Davenport", "St…
## $ num_user_for_reviews      <int> 3054, 1238, 994, 2701, 738, 1902, …
## $ num_critic_for_reviews    <int> 723, 302, 602, 813, 462, 392, 324,…
## $ num_voted_users           <int> 886204, 471220, 275868, 1144337, 2…
## $ movie_imdb_link           <chr> "http://www.imdb.com/title/tt04995…

# Keep only the character columns and report their names.
# select(where(...)) is the current replacement for the superseded select_if().
movies %>%
  select(where(is.character)) %>%
  names()
##  [1] "movie_title"     "director_name"   "country"        
##  [4] "language"        "genres"          "content_rating" 
##  [7] "color"           "plot_keywords"   "actor_1_name"   
## [10] "actor_2_name"    "actor_3_name"    "movie_imdb_link"

1f) Use the slice function to show the first 10 rows of the dataset.


# First ten rows of the dataset, in their original order.
slice(movies, seq_len(10))

1g) Use the arrange function to order the dataset in descending order by gross or total box office revenue in USD. Use the slice function to show the ten movies with the largest box office revenue in the dataset. Which director or directors have multiple movies among the top 10 highest-grossing films of all time?

James Cameron, Joss Whedon, Christopher Nolan, George Lucas.


# Sort from highest to lowest gross, then keep the ten rows at the top.
movies %>%
  arrange(desc(gross)) %>%
  slice(seq_len(10))

1h) Remove the variable movie_imdb_link. Store the smaller dataset as movies_sub. Run the dimension command to ensure that movies_sub has one fewer variable than movies.


# Drop the IMDB link column and store the result as movies_sub.
movies_sub <- select(movies, -movie_imdb_link)

# Should report one fewer column than the original movies data frame.
dim(movies_sub)
## [1] 3889   27

1i) How many movies in the dataset had a budget in excess of $200M USD? (Use the original movies dataset.)


# The question asks "how many", so count the matching rows instead of
# printing them. filter() drops rows whose budget is NA.
movies %>%
  filter(budget > 200000000) %>%
  nrow()

1j) Create two new variables, budgetM and grossM that report budget and gross in millions of USD.


# Add budget and gross expressed in millions of USD.
movies <- mutate(
  movies,
  budgetM = budget / 1e6,
  grossM = gross / 1e6
)

1k) Use the glimpse command over the data set to ensure the variables have been transformed appropriately.


# Confirm the two new columns (budgetM, grossM) appear with sensible values.
movies %>%
  glimpse()
## Rows: 3,889
## Columns: 30
## $ movie_title               <chr> "Avatar", "Pirates of the Caribbea…
## $ director_name             <chr> "James Cameron", "Gore Verbinski",…
## $ gross                     <int> 760505847, 309404152, 200074175, 4…
## $ budget                    <dbl> 237000000, 300000000, 245000000, 2…
## $ country                   <chr> "USA", "USA", "UK", "USA", "USA", …
## $ title_year                <int> 2009, 2007, 2015, 2012, 2012, 2007…
## $ imdb_score                <dbl> 7.9, 7.1, 6.8, 8.5, 6.6, 6.2, 7.8,…
## $ language                  <chr> "English", "English", "English", "…
## $ duration                  <int> 178, 169, 148, 164, 132, 156, 100,…
## $ genres                    <chr> "Action|Adventure|Fantasy|Sci-Fi",…
## $ content_rating            <chr> "PG-13", "PG-13", "PG-13", "PG-13"…
## $ aspect_ratio              <dbl> 1.78, 2.35, 2.35, 2.35, 2.35, 2.35…
## $ color                     <chr> "Color", "Color", "Color", "Color"…
## $ plot_keywords             <chr> "avatar|future|marine|native|parap…
## $ movie_facebook_likes      <int> 33000, 0, 85000, 164000, 24000, 0,…
## $ director_facebook_likes   <int> 0, 563, 0, 22000, 475, 0, 15, 0, 2…
## $ cast_total_facebook_likes <int> 4834, 48350, 11700, 106759, 1873, …
## $ facenumber_in_poster      <int> 0, 0, 1, 0, 1, 0, 1, 4, 3, 0, 0, 1…
## $ actor_1_facebook_likes    <int> 1000, 40000, 11000, 27000, 640, 24…
## $ actor_1_name              <chr> "CCH Pounder", "Johnny Depp", "Chr…
## $ actor_2_facebook_likes    <int> 936, 5000, 393, 23000, 632, 11000,…
## $ actor_2_name              <chr> "Joel David Moore", "Orlando Bloom…
## $ actor_3_facebook_likes    <int> 855, 1000, 161, 23000, 530, 4000, …
## $ actor_3_name              <chr> "Wes Studi", "Jack Davenport", "St…
## $ num_user_for_reviews      <int> 3054, 1238, 994, 2701, 738, 1902, …
## $ num_critic_for_reviews    <int> 723, 302, 602, 813, 462, 392, 324,…
## $ num_voted_users           <int> 886204, 471220, 275868, 1144337, 2…
## $ movie_imdb_link           <chr> "http://www.imdb.com/title/tt04995…
## $ budgetM                   <dbl> 237.0, 300.0, 245.0, 250.0, 263.7,…
## $ grossM                    <dbl> 760.50585, 309.40415, 200.07417, 4…

Question 2: Basic Plotting with ggplot2

2a) Create a scatter plot of IMDB score on the x axis and movie gross on the y-axis


# code for 2a

# Scatter plot of IMDB score (x) against gross revenue in USD (y).
# No data(movies) call: data() loads datasets shipped with packages, while
# `movies` here was read from the CSV. The call is at best a warning and
# could overwrite the object if an attached package provided a `movies` set.
ggplot(data = movies, aes(x = imdb_score, y = gross)) +
  geom_point()

2b) This looks okay, but there are so many points it is hard to see how much underlying data each point represents. Let’s make it more transparent by using alpha =.

# Same scatter plot with 10% opacity so overlapping points show density.
# No data(movies) call: `movies` comes from the CSV, not a packaged dataset.
ggplot(data = movies, aes(x = imdb_score, y = gross)) +
  geom_point(alpha = 1 / 10)

2c) It would be more readable if the units of the y-axis were in millions of dollars versus just USD. Use the mutate() function to create a new variable gross_M which lists gross in millions of dollars. Store this new variable in the movies dataset.


# Add gross in millions of USD so the y-axis is easier to read.
# No data(movies) call: `movies` comes from the CSV, not a packaged dataset,
# and reloading a packaged dataset of that name would discard added columns.
movies <-
  movies %>%
  mutate(gross_M = gross / 1000000)

ggplot(data = movies, aes(x = imdb_score, y = gross_M)) +
  geom_point(alpha = 1 / 10)

2d) Create a scatter plot of imdb_score against gross_M and use the geom_smooth function to make a smoothing line. Is there a relationship between movie gross and IMDB score?

There is a relationship between movie gross and IMDB score.


# Scatter plot of IMDB score against gross (millions) with a linear fit.
# The question asks for the points as well as the smoothing line, so
# geom_point() is drawn underneath geom_smooth().
# No data(movies) call: `movies` comes from the CSV, not a packaged dataset.
ggplot(data = movies, aes(x = imdb_score, y = gross_M)) +
  geom_point(alpha = 1 / 10) +
  geom_smooth(method = "lm")

2e) Only include movies by the director Justin Lin. Plot these movies as points using the geom_point() function. Change the shape of the points to any shape other than the default shape.


# Restrict the data to films directed by Justin Lin.
# No data(movies) call: `movies` comes from the CSV, not a packaged dataset.
movies_justinlin <-
  movies %>%
  filter(director_name == "Justin Lin")

# shape = 2 replaces the default filled circle with a different marker.
ggplot(data = movies_justinlin, aes(x = imdb_score, y = gross_M)) +
  geom_point(shape = 2)

2f) This looks okay, but the text is clipped on the right and left of the plot. Change the range of the x axis to [5,9] and the y axis range to [0,250].


# Restrict the data to films directed by Justin Lin.
# No data(movies) call: `movies` comes from the CSV, not a packaged dataset.
movies_justinlin <-
  movies %>%
  filter(director_name == "Justin Lin")

# Axis ranges are set with the xlim()/ylim() layers; the stray `xlim`
# that was passed as an extra argument to ggplot() has been removed.
ggplot(data = movies_justinlin, aes(x = imdb_score, y = gross_M)) +
  geom_point(shape = 2) +
  xlim(5, 9) +
  ylim(0, 250)

2g) Install the package ggrepel and load it using the library function. Add to the plot geom_text_repel(aes(label = movie_title)) to add labels to the scatter plot showing the names of the movie titles.


# Restrict the data to films directed by Justin Lin.
# No data(movies) call: `movies` comes from the CSV, not a packaged dataset.
movies_justinlin <-
  movies %>%
  filter(director_name == "Justin Lin")

# Label each point with its movie title; ggrepel keeps labels from overlapping.
# The trailing empty argument in the ggplot() call has been removed.
ggplot(data = movies_justinlin, aes(x = imdb_score, y = gross_M)) +
  geom_point(shape = 2) +
  xlim(5, 9) +
  ylim(0, 250) +
  geom_text_repel(aes(label = movie_title))

2h) Add labels


# Restrict the data to films directed by Justin Lin.
# No data(movies) call: `movies` comes from the CSV, not a packaged dataset.
movies_justinlin <-
  movies %>%
  filter(director_name == "Justin Lin")

# Labelled scatter plot with descriptive axis titles.
# The trailing empty argument in the ggplot() call has been removed.
ggplot(data = movies_justinlin, aes(x = imdb_score, y = gross_M)) +
  geom_point(shape = 2) +
  xlim(5, 9) +
  ylim(0, 250) +
  geom_text_repel(aes(label = movie_title)) +
  labs(x = "imdb score", y = "gross (in millions)")

2i) Add a theme to the graph and add the option base_size = 14


# Restrict the data to films directed by Justin Lin.
# No data(movies) call: `movies` comes from the CSV, not a packaged dataset.
movies_justinlin <-
  movies %>%
  filter(director_name == "Justin Lin")

# Final plot: labelled points, fixed axis ranges, axis titles, and the
# light theme with a 14-point base font size.
# The trailing empty argument in the ggplot() call has been removed.
ggplot(data = movies_justinlin, aes(x = imdb_score, y = gross_M)) +
  geom_point(shape = 2) +
  xlim(5, 9) +
  ylim(0, 250) +
  geom_text_repel(aes(label = movie_title)) +
  labs(x = "imdb score", y = "gross (in millions)") +
  theme_light(base_size = 14)

Machine Learning for Managers

Loading Files and Basic Plotting with GGplot

Nicholas Kondo

Question 1: Loading Files and Working With Directories.

Question 2: Basic Plotting with ggplot2