Visualize the ratings and vote counts of the top 250 IMDb movies
摘自 http://www.r-bloggers.com/top-250-movies-at-imdb/
################################
## Top 250 IMDb movies: visualize the rating and votes
################################
##
## Script outline:
##   1. scrape the chart table from IMDb,
##   2. clean up the Rating / Title / Votes columns and derive Year,
##   3. draw a bubble plot of Rating against Year, sized by Votes.

library(XML)
library(ggplot2)

## Retrieve the page from IMDb and parse out the appropriate table
## (`which = 2` selects the second table found on the page).
## NOTE(review): this depends on the live page layout and on the URL being
## reachable by readHTMLTable -- confirm before relying on the result.
url <- "http://www.imdb.com/chart/top"
best.movies <- readHTMLTable(url, which = 2, stringsAsFactors = FALSE)
head(best.movies)

## The rank column is redundant since the same information is captured by
## the row labels.
best.movies[, 1] <- NULL
head(best.movies)

## Remaining problems, all easily fixed:
##   * the years are bundled up with the titles;
##   * the rating data are strings;
##   * the votes data are also strings and have embedded commas.
##
## `pattern` splits a title into the leading text (group 1) and the content
## of the trailing parentheses (group 2).
pattern <- "(.*) \\((.*)\\)$"

## transform() evaluates every expression against the incoming data frame,
## so `Year` is derived from the original, still-bundled `Title`.
best.movies <- transform(
  best.movies,
  Rating = as.numeric(Rating),
  ## only the first four characters of the parenthesised part are the year
  Year = as.integer(substr(gsub(pattern, "\\2", Title), 1, 4)),
  Title = gsub(pattern, "\\1", Title),
  Votes = as.integer(gsub(",", "", Votes))
)

## Reorder the columns by name rather than by position, so the selection
## does not silently pick the wrong columns if the scraped table changes.
best.movies <- best.movies[, c("Year", "Title", "Votes", "Rating")]
head(best.movies)

## Finally, to gain a little perspective on the relationship between the
## release year, votes and rating, put together a simple bubble plot.
ggplot(best.movies, aes(x = Year, y = Rating)) +
  geom_point(
    aes(size = Votes),
    alpha = 0.5,
    position = "jitter",
    color = "darkgreen"
  ) +
  scale_size(range = c(3, 15)) # + theme_classic()