# Source: web_scraping/scraper_scrapy.py (GitHub: facmartoni/web_scraping, branch master)

import scrapy
from scrapy.crawler import CrawlerProcess


class Spider12(scrapy.Spider):
    """Crawl Página/12 section listings and yield one item per article.

    Each URL in ``start_urls`` is a section listing page. ``parse`` follows
    the featured article, every article in the list and the "next page"
    link; ``parse_nota`` extracts the fields of a single article.
    """

    name = 'spider12'
    allowed_domains = ['pagina12.com.ar']
    # NOTE(review): FEED_FORMAT / FEED_URI are the legacy feed-export
    # settings; newer Scrapy releases prefer the FEEDS dict. Left untouched
    # because the Scrapy version this project targets is not visible here.
    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': 'resultados_scrapy.json',
        'DEPTH_LIMIT': 2,
        'FEED_EXPORT_ENCODING': 'utf-8',
    }
    start_urls = [
        'https://www.pagina12.com.ar/secciones/el-pais',
        'https://www.pagina12.com.ar/secciones/economia',
        'https://www.pagina12.com.ar/secciones/sociedad',
        'https://www.pagina12.com.ar/suplementos/cultura-y-espectaculos',
        'https://www.pagina12.com.ar/secciones/ciencia',
        'https://www.pagina12.com.ar/secciones/el-mundo',
        'https://www.pagina12.com.ar/secciones/deportes',
        'https://www.pagina12.com.ar/secciones/contratapa',
    ]

    def parse(self, response):
        """Parse a section listing page.

        Yields a request (handled by ``parse_nota``) for the featured
        article, if any, and for every article link in the list, plus a
        request for the next listing page (handled by ``parse``) when a
        pagination link is present.
        """
        # Featured (promoted) article
        featured_href = response.xpath(
            '//div[@class="featured-article__container"]/h2/a/@href').get()
        if featured_href is not None:
            yield response.follow(featured_href, callback=self.parse_nota)

        # Article list
        article_hrefs = response.xpath(
            '//ul[@class="article-list"]//li//a/@href').getall()
        for href in article_hrefs:
            yield response.follow(href, callback=self.parse_nota)

        # Link to the next page. ``.get()`` extracts the href string:
        # ``response.follow`` does not accept a SelectorList, so passing the
        # raw ``xpath()`` result would raise ValueError.
        next_page = response.xpath(
            '//a[@class="pagination-btn-next"]/@href').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_nota(self, response):
        """Parse a single article page and yield its fields as a dict.

        The yielded item has the keys ``url``, ``title``, ``date``,
        ``summary``, ``prefix``, ``body`` and ``author``. Fields extracted
        with ``.get()`` are ``None`` when the XPath matches nothing;
        ``body`` is the matched paragraph texts joined by blank lines (an
        empty string when there are none).
        """
        title = response.xpath('//div[@class="article-title"]/text()').get()
        date = response.xpath('//span[@pubdate="pubdate"]/@datetime').get()
        summary = response.xpath(
            '//div[@class="article-summary"]/text()').get()
        prefix = response.xpath('//div[@class="article-prefix"]/text()').get()
        # Text of every <p> under the article-text div inside article-body.
        paragraphs = response.xpath(
            '//div[@class="article-body"]'
            '//div[@class="article-text"]//p/text()').getall()
        body = "\n\n".join(paragraphs)
        author = response.xpath(
            '//div[@class="article-author"]//span//a/text()').get()

        yield {
            'url': response.url,
            'title': title,
            'date': date,
            'summary': summary,
            'prefix': prefix,
            'body': body,
            'author': author,
        }


# Run the spider from this script: create a crawler process, register
# Spider12 with it and start it.
# NOTE(review): these statements run at import time because there is no
# ``if __name__ == "__main__":`` guard — confirm this module is only ever
# executed directly and never imported.
process = CrawlerProcess()
process.crawl(Spider12)
process.start()