Visualize the ratings and votes of the top 250 IMDb movies

摘自 http://www.r-bloggers.com/top-250-movies-at-imdb/

################################
## 250 IMDb movies, visualize the rating and votes
################################



## Download the IMDb chart page and let the XML package turn one of its HTML
## tables into a data frame.

library(XML)
url <- "http://www.imdb.com/chart/top"
best.movies <- readHTMLTable(
  doc = url,
  which = 2,                 # index of the table to extract from the page
  stringsAsFactors = FALSE   # keep text columns as character vectors
)
head(best.movies)

## The first column only repeats the rank, which the row labels already give,
## so it is removed.
best.movies[[1]] <- NULL
head(best.movies)

## Remaining clean-up of the scraped columns:
##   * Title carries the release year in trailing parentheses,
##   * Rating is stored as text,
##   * Votes is stored as text with comma separators.
## The regex captures (1) the bare title and (2) the parenthesised suffix.
pattern <- "(.*) \\((.*)\\)$"
best.movies <- transform(
  best.movies,
  # transform() evaluates every expression against the original columns, so
  # Year is still derived from the unmodified Title. Only the first four
  # characters of the parenthesised suffix are kept as the year.
  Year   = as.integer(substr(gsub(pattern, "\\2", Title), 1, 4)),
  Title  = gsub(pattern, "\\1", Title),
  Rating = as.numeric(Rating),
  Votes  = as.integer(gsub(",", "", Votes))
)
## Reorder columns by position. NOTE(review): presumably this yields
## Year, Title, Votes, Rating, but that depends on the column order of the
## scraped table -- confirm against the head() output.
best.movies <- best.movies[c(4, 2, 3, 1)]
head(best.movies)


## Bubble plot relating release year (x axis), rating (y axis) and number of
## votes (point size).
library(ggplot2)
ggplot(
  data = best.movies,
  mapping = aes(x = Year, y = Rating, size = Votes)
) +
  geom_point(
    colour = "darkgreen",
    alpha = 0.5,           # semi-transparent so overlapping bubbles stay visible
    position = "jitter"    # random offsets to separate coincident points
  ) +
  scale_size(range = c(3, 15))
## Optional: append `+ theme_classic()` to the chain above.

 

 

 

 

 

posted @ 2013-10-07 00:12  BinbinChen  阅读(238)  评论(0)  编辑  收藏  举报