본문 바로가기
  • Let's go grab a data
Data/R

html 파싱하기 rvest 패키지

by pub-lican-ai 2017. 4. 14.
반응형

html 파싱하기 rvest 패키지


# Install rvest only when it is missing, so that re-running the script does
# not reinstall the package on every execution.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}

library(rvest)

# Naver Music Top 100 chart page; the query string asks for
# domain=DOMESTIC and duration=1h.
url <- "http://music.naver.com/listen/top100.nhn?domain=DOMESTIC&duration=1h"

# Save the page to a local file first, then parse the saved copy.
download.file(url, destfile = "navermusic.html", quiet = TRUE)

# Parsed HTML document that the selector steps below query.
naver_music <- read_html("navermusic.html")


# Chart positions: take the text of every ".ranking" node found under
# "._tracklist_move" and convert it to numbers.
ranking <- as.numeric(
  html_text(
    html_nodes(naver_music, "._tracklist_move .ranking")
  )
)

# Show the result when run interactively.
ranking


# Rank-change text of each track: the text of every ".change" node under
# "._tracklist_move", with carriage returns, newlines and tabs removed.
#
# The character class replaces the earlier alternation that contained empty
# branches ("||"); an empty branch matches the empty string, so that pattern
# only stripped the control characters when the regex engine preferred the
# longest alternative.
gap <- naver_music %>%
  html_nodes("._tracklist_move .change") %>%
  html_text() %>%
  gsub(pattern = "[\r\n\t]", replacement = "", x = .)

# Show the result when run interactively.
gap


# Track titles: the text of every "span" under "._title" inside
# "._tracklist_move", with carriage returns, newlines and tabs removed and
# surrounding whitespace trimmed.
#
# The character class replaces the earlier alternation that contained empty
# branches ("||"), which match the empty string and had no branch for a lone
# newline.
title <- naver_music %>%
  html_nodes("._tracklist_move ._title span") %>%
  html_text() %>%
  gsub(pattern = "[\r\n\t]", replacement = "", x = .) %>%
  trimws()

# Show the result when run interactively.
title


# Artist names: the text of every "a" link under "._artist" inside
# "._tracklist_move", with carriage returns, newlines and tabs removed and
# surrounding whitespace trimmed.
#
# The character class replaces the earlier alternation that contained empty
# branches ("||"), which match the empty string and had no branch for a lone
# newline.
artist <- naver_music %>%
  html_nodes("._tracklist_move ._artist a") %>%
  html_text() %>%
  gsub(pattern = "[\r\n\t]", replacement = "", x = .) %>%
  trimws()

# Show the result when run interactively.
artist


# Combine the four scraped vectors into one table, keeping text columns as
# character. data.frame() stops with an error if the vectors differ in
# length, i.e. if the selectors did not match the same number of nodes.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
musicTop50 <- data.frame(
  ranking,
  gap,
  title,
  artist,
  stringsAsFactors = FALSE
)

# Show the result when run interactively.
musicTop50


library("rvest")

# Scrape one page of the U.S. News Best Global Universities ranking.
#
# page: page number appended to the "?page=" query parameter.
#
# Downloads the page to "usnews.html" (overwritten on every call), parses the
# saved copy, and returns a data frame with the character columns ranking,
# univ_names, countries and scores. data.frame() stops with an error if the
# four selectors match different numbers of nodes.
scrape_rankings_page <- function(page) {
  url <- paste0(
    "https://www.usnews.com/education/best-global-universities/rankings?page=",
    page
  )
  download.file(url, destfile = "usnews.html", quiet = TRUE)
  doc <- read_html("usnews.html")

  # Text of every node matched by a CSS selector on this page.
  cell_text <- function(css) {
    html_text(html_nodes(doc, css))
  }

  data.frame(
    # The rank text is kept exactly as scraped (not trimmed).
    ranking = cell_text(".sep .rankscore-bronze"),
    univ_names = trimws(cell_text(".sep .h-taut a")),
    countries = trimws(cell_text(".sep .t-taut")),
    scores = trimws(cell_text(".sep .thumb-right")),
    stringsAsFactors = FALSE
  )
}

# Fetch pages 1 to 10 and bind them once at the end, instead of growing the
# data frame with rbind() on every iteration.
ranking100 <- do.call(rbind, lapply(1:10, scrape_rankings_page))

# Show the result when run interactively.
ranking100




반응형