-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathBasicScraping.R
More file actions
98 lines (66 loc) · 2.09 KB
/
BasicScraping.R
File metadata and controls
98 lines (66 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# CRAWLERS
# Collect the links a crawler would follow: read every href attribute
# from the anchor tags of a small HTML fragment.
xData <- '<div class="offer-name">
<a href="http://www.somesite.com" itemprop="name">Fancy Product</a>
</div>'
library(httr)
library(xml2)
# xmlParse(), xpathSApply(), xmlGetAttr() and xmlValue() are provided by the
# XML package (not xml2), so it has to be attached before they are called.
library(XML)
# asText = TRUE states explicitly that xData holds markup, not a file path.
parsedHtml <- xmlParse(xData, asText = TRUE)
# href of every <a> element in the document
enlaces <- xpathSApply(parsedHtml, "//a", xmlGetAttr, "href")
enlaces
# Text content of each <div class="offer-name"> element
Products <- xpathSApply(parsedHtml, "//div[@class='offer-name']", xmlValue)
Products
# href of the <a> elements that are direct children of a <div>
hrefs <- xpathSApply(parsedHtml, "//div/a", xmlGetAttr, "href")
hrefs
# Iterating over links: decide whether a URL is relative or absolute.
url <- "http://www.e-katec.com"
url1 <- "/video1.mp4"
?grep
# Exploratory calls: grep() returns the indices of the matching elements,
# so a non-match yields integer(0).
grep("http", url)
grep("http", url1)
# Full URL: it must START with an http:// or https:// scheme. Anchoring the
# pattern keeps a relative path that merely contains "http" from being
# classified as absolute; URL schemes are case-insensitive.
a <- grepl("^https?://", url, ignore.case = TRUE) # absolute
a
a1 <- grepl("^https?://", url1, ignore.case = TRUE) # relative
a1
# URL without a protocol: also accept anything that names the site's host.
# fixed = TRUE matches the dots literally instead of as regex wildcards.
b <- grepl("www.e-katec.com", url, fixed = TRUE) || a # absolute
b
b1 <- grepl("www.e-katec.com", url1, fixed = TRUE) || a1 # relative
b1
library(httr)
# A relative URL does not name a host, so it has to be completed: bind a
# handle to the domain and pass the relative part as `path`.
handler <- handle("http://www.e-katec.com")
html <- GET(handle = handler, path = url1)
html
# An absolute URL can be passed to GET directly.
# download url content
html <- GET(url = url)
html
# wait 2 seconds
Sys.sleep(2)
# SCRAPING
# Download one page and extract its title, paragraph text, links and images.
library(XML)
library(httr)
html <- GET("http://www.e-katec.com")
# The result is stored in a variable that is also named `content`, so the
# httr function is called through its namespace to keep the call unambiguous.
content <- httr::content(html, as = "text")
content
parsedHtml <- htmlParse(content, asText = TRUE)
parsedHtml
# Text of the <title> element
title <- xpathSApply(parsedHtml, "//title", xmlValue)
title
# Text of every paragraph
texts <- xpathSApply(parsedHtml, "//p", xmlValue)
texts
# Visible text and target of every link
links_text <- xpathSApply(parsedHtml, "//a", xmlValue)
links_text
links_url <- xpathSApply(parsedHtml, "//a", xmlGetAttr, "href")
links_url
# Source of every image
images_url <- xpathSApply(parsedHtml, "//img", xmlGetAttr, "src")
images_url
# Check whether the first image URL is relative or absolute.
imagen_prueba <- images_url[1]
# Absolute means the value starts with an http:// or https:// scheme; a bare
# "contains http" test would also accept relative paths such as
# "/img/http-logo.png". The length guard keeps an image-less page from being
# reported as having a relative URL.
a <- length(images_url) > 0 &&
  grepl("^https?://", as.character(imagen_prueba), ignore.case = TRUE)
if (length(images_url) == 0) {
  "SIN IMAGENES"
} else if (a) {
  "URL ABSOLUTA"
} else {
  "URL RELATIVA"
}