# Date:    1/14, 2017, and 4/25, 2017, 11/23, 2020
# Topics:  read data from website: US Election 2016 (English Version)
# File:    https://goo.gl/25o9dv
#          https://en.wikipedia.org/wiki/2016_United_States_presidential_election
#          http://homepage.ntu.edu.tw/~fengli/Teaching/Computer/ReadFile/USA_2016_en.htm
#
# Format:  html
# Content: 2016 United States presidential election, ..., campaign finance
# Name:    Feng-Li Lian
#
# Here is the beginning.

# Load packages ----
# install.packages("RCurl")
library(RCurl)
# install.packages("XML")
library(XML)

# Set up the link to the website ----
# The Wikipedia page is kept as a commented-out alternative; the script reads
# the copy hosted on the course homepage.
# url <- "https://en.wikipedia.org/wiki/2016_United_States_presidential_election"
url <- "http://homepage.ntu.edu.tw/~fengli/Teaching/Computer/ReadFile/USA_2016_en.htm"

# Download the page as a single character string ----
html <- getURL(url)
# Alternative calls that force a specific character encoding:
# html <- getURL(url, encoding = "ASCII")
# html <- getURL(url, encoding = "SHIFT-JIS")
# html <- getURL(url, encoding = "UTF-8")
# html <- getURL(url, encoding = "big5")
# html

# Fail early with a clear message instead of parsing an empty document later.
if (length(html) != 1L || is.na(html) || !nzchar(html)) {
  stop("Download returned no content for: ", url, call. = FALSE)
}

# Next step: parse the HTML string and generate an R structure representing
# the XML/HTML tree.
# Parse the page and pull out its visible text ----
# `html` is the page content as a string; asText = TRUE makes htmlParse treat
# it as content rather than as a file name.
doc <- htmlParse(html, asText = TRUE)
# doc

# Select every text node that is not nested inside a <script>, <style>,
# <noscript> or <form> element, and take its string value.
plain.text <- xpathSApply(
  doc,
  "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]",
  xmlValue
)
# plain.text

# Number of matching text nodes.
NN <- length(plain.text)
NN

# Spot checks: look at element 200 and compare its neighbours with a known
# label (out-of-range positions simply print NA).
plain.text[200]
plain.text[199] == "Democratic Party"
plain.text[200] == "Democratic Party"
plain.text[201] == "Democratic Party"

# Locate the campaign-finance section ----
# The table of interest lies between the first text node equal to
# "Campaign finance" and the first text node equal to "Voting rights".

# Return the positions in `text` that are exactly equal to `keyword`.
# which() ignores NA entries and returns integer(0) when nothing matches, so
# this is safe for empty input.
find_keyword <- function(text, keyword) {
  which(text == keyword)
}

# All positions of "Campaign finance".
di1 <- find_keyword(plain.text, "Campaign finance")
di1

# All positions of "Voting rights".
di2 <- find_keyword(plain.text, "Voting rights")
di2

# Without both markers the slice below would be meaningless, so stop here with
# an explicit message rather than continuing with a wrong range.
if (length(di1) == 0 || length(di2) == 0) {
  stop(
    "Could not find both section markers ",
    "(\"Campaign finance\" and \"Voting rights\") in the page text.",
    call. = FALSE
  )
}
if (di2[1] < di1[1]) {
  stop(
    "\"Voting rights\" appears before \"Campaign finance\"; ",
    "the page layout is not what this script expects.",
    call. = FALSE
  )
}

# Text nodes between the two markers (inclusive), and how many there are.
key.text <- plain.text[di1[1]:di2[1]]
key.text
NK <- length(key.text)
NK

# Read the figures that follow each candidate's name ----
# 10 x 8 storage: one row per candidate (only rows 1-3 are filled here), one
# column per figure. Unfilled cells stay 0.
data.main <- matrix(0, nrow = 10, ncol = 8)

# Candidate labels exactly as they appear in the page text; the position in
# this vector is the row of data.main that the candidate fills.
candidates <- c("Hillary \n Clinton", "Donald \n Trump", "Gary \n Johnson")

# The 8 figures for a candidate sit at offsets 4, 6, ..., 18 after the name.
value.offsets <- 2 + 2 * seq_len(8)

for (row in seq_along(candidates)) {
  hits <- which(key.text == candidates[row])
  if (length(hits) > 0) {
    # If the name occurs more than once, the last occurrence wins.
    last.hit <- hits[length(hits)]
    # Offsets that run past the end of key.text yield NA.
    data.main[row, ] <- key.text[last.hit + value.offsets]
  }
}
data.main

# Convert the figures to numeric ----
# Strip "$" and "," and convert the whole matrix at once; the values are laid
# back out in the same 10 x 8 shape. Text that is not a number becomes NA.
data.main.num <- matrix(
  as.numeric(gsub("[$,]", "", data.main)),
  nrow = nrow(data.main),
  ncol = ncol(data.main)
)
data.main.num