# Date:     1/14, 2017, and 4/25, 2017, 11/23, 2020
# Topics:   read data from website: US Election 2016 (English Version)
# File:     https://goo.gl/25o9dv
# https://en.wikipedia.org/wiki/2016_United_States_presidential_election
# http://homepage.ntu.edu.tw/~fengli/Teaching/Computer/ReadFile/USA_2016_en.htm
#
# Format:   html
# Content:  2016 United States presidential election, ..., Campaign finance
# Name: Feng-Li Lian
#
#
# The script begins here.
# load packages

# install.packages( "RCurl" )
library(RCurl)

# install.packages( "XML" )
library(XML)

# Address of the page to scrape. The Wikipedia article is kept for reference;
# the script actually reads the copy hosted under homepage.ntu.edu.tw.
# url <- "https://en.wikipedia.org/wiki/2016_United_States_presidential_election"

url <- "http://homepage.ntu.edu.tw/~fengli/Teaching/Computer/ReadFile/USA_2016_en.htm"


# Fetch the page and keep the raw HTML in `html`.
# Earlier experiments passed an explicit encoding to getURL(), e.g.
#   html <- getURL( url, encoding="UTF-8" )
# (ASCII, SHIFT-JIS and big5 were tried the same way).

html <- getURL(url)

# Uncomment to inspect the raw download:
# html

# Turn the downloaded string into an HTML tree. asText = TRUE makes the parser
# treat `html` as markup rather than as a file name.
doc <- htmlParse(html, asText = TRUE)

# Uncomment to inspect the parsed tree:
# doc

# Pull the value of every text node out of the tree, leaving out text that
# sits inside <script>, <style>, <noscript> or <form> elements.
text.xpath <- "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]"
plain.text <- xpathSApply(doc, text.xpath, xmlValue)

# Uncomment to inspect the extracted text nodes:
# plain.text

# Count the extracted text nodes in plain.text and print the count.
NN <- length( plain.text )
NN

# Print the 200th extracted text element as a spot check.
plain.text[200]

# Sanity check: compare elements 199, 200 and 201 with "Democratic Party";
# each comparison prints its own result.

plain.text[199] == "Democratic Party"
plain.text[200] == "Democratic Party"
plain.text[201] == "Democratic Party"

# The figures of interest sit between the text node "Campaign finance" and
# the text node "Voting rights". This step records where "Campaign finance"
# occurs in plain.text.
#
# Result:
#   di1 - one-column matrix of match positions, padded with zeros; it has at
#         least 10 rows and grows if there are more than 10 matches.
#         di1[1] is the first match, or 0 when the keyword never occurs.
#   j   - number of matches plus one (the next free slot in di1).

# which() returns the positions of all exact matches in ascending order and
# copes with an empty plain.text, where a 1:NN loop would run over c(1, 0).
campaign.hits <- which(plain.text == "Campaign finance")

di1 <- matrix(0, max(10, length(campaign.hits)))
di1[seq_along(campaign.hits)] <- campaign.hits
j <- length(campaign.hits) + 1

di1
j

# Record where the text node "Voting rights" occurs in plain.text.
#
# Result:
#   di2 - one-column matrix of match positions, padded with zeros; it has at
#         least 10 rows and grows if there are more than 10 matches.
#         di2[1] is the first match, or 0 when the keyword never occurs.
#   j   - number of matches plus one (the next free slot in di2).

# which() returns the positions of all exact matches in ascending order and
# copes with an empty plain.text, where a 1:NN loop would run over c(1, 0).
voting.hits <- which(plain.text == "Voting rights")

di2 <- matrix(0, max(10, length(voting.hits)))
di2[seq_along(voting.hits)] <- voting.hits
j <- length(voting.hits) + 1

di2
j



# Cut out the text nodes from the first "Campaign finance" match (di1[1]) to
# the first "Voting rights" match (di2[1]), both inclusive, and count them.

section.start <- di1[1]
section.end <- di2[1]

# A value of 0 means the keyword search found nothing. Indexing with 0 or with
# a reversed range would silently return the wrong slice, so stop here instead.
if (section.start < 1 || section.end < 1) {
  stop(
    "Could not find both \"Campaign finance\" and \"Voting rights\" in plain.text",
    call. = FALSE
  )
}
if (section.start > section.end) {
  stop(
    "\"Campaign finance\" was found after \"Voting rights\"; cannot slice the section",
    call. = FALSE
  )
}

key.text <- plain.text[section.start:section.end]
key.text

NK <- length(key.text)
NK

# Extract the values that follow each candidate's name inside key.text.

# Two 10 x 8 matrices: data.main receives the raw text cells and
# data.main.num is reserved for their numeric conversion.
# Only rows 1-3 of data.main are filled below; the rest keep their initial 0.
data.main <- matrix(0, nrow = 10, ncol = 8)
data.main.num <- matrix(0, nrow = 10, ncol = 8)

# Text nodes that mark each candidate; position in this vector = row of
# data.main (1 = Hillary Clinton, 2 = Donald Trump, 3 = Gary Johnson).
# The embedded line break and spaces must match the text node exactly
# (presumably they mirror the layout of the page source).
candidate.labels <- c(
  "Hillary \n      Clinton",
  "Donald \n      Trump",
  "Gary \n      Johnson"
)

# The 8 values are read from every second node after the label:
# label + 4, label + 6, ..., label + 18.
value.offsets <- 2 + 2 * (1:8)

for (row in seq_along(candidate.labels)) {
  # All positions whose text equals the label, in ascending order.
  label.hits <- which(key.text == candidate.labels[row])
  for (hit in label.hits) {
    # Later matches overwrite earlier ones, so the last occurrence wins.
    # Positions past the end of key.text yield NA.
    data.main[row, ] <- key.text[hit + value.offsets]
  }
}

data.main

# Convert the extracted cells to numbers: drop every "$" and "," and parse
# what is left. Cells that are not numeric after stripping become NA.
# The whole matrix is converted in one vectorised call and the result takes
# its shape from data.main, so no dimensions are hard-coded here.
data.main.num <- matrix(
  as.numeric(gsub("[$,]", "", data.main)),
  nrow = nrow(data.main),
  ncol = ncol(data.main)
)

data.main.num