pars-livelib/book.py at master · jukka413/pars-livelib · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import re

from bs4 import BeautifulSoup
from csv_writer import CsvWriter
from get_html import GetHTML
from get_books import GetBook


def get_books_pages(ll_login):
    url_base = 'https://livelib.ru/reader/'
    url_text = ll_login + '/wish/~'

    list_of_books_is_empty = True

    books = []
    webpage = 1

    while list_of_books_is_empty:
        url = url_base + url_text + str(webpage)
        html = GetHTML.get_html(url)
        # print(html)
        soup = BeautifulSoup(html, 'lxml')

        checkList = soup.find('div', class_='book-data')
        # print(checkList)
        if checkList:  # and webpage < 3:
            books = GetBook.get_books(html, books)
            webpage += 1
            parse_books_info(books)
            books = []
        else:
            list_of_books_is_empty = False

    return books


def parse_books_info(books):
    for i in range(len(books)):
        link = 'https://livelib.ru/' + books[i]
        print(link)
        print('Скачано ', i + 1, ' из ', len(books))
        html = GetHTML.get_html(link)

        soup = BeautifulSoup(html, 'lxml')

        name = soup.find('title', id="title-head")
        name = str(name.text)

        author = soup.find(href=re.compile("author/"))
        if author is None:
            author = 'No Author Info'
        else:
            author = soup.find(href=re.compile("author/")).text

        pages = soup.find_all('div', class_='bc-info__wrapper')  # id="5",
        page = ""
        publication_date = 'No date info'
        if pages is None:
            page = 'No Pages Info'
        else:
            try:
                page_tmp = pages[1]
            except IndexError:
                continue
            page_tmp = page_tmp.find_all('p')
            for i in range(len(page_tmp)):
                page_tmp_l = str(page_tmp[i])
                page_tmp_l = page_tmp_l.replace('  ', '').replace('\n', ' ').replace('<br/>', ' ')
                page_tmp_l = page_tmp_l[page_tmp_l.find('>') + 1:page_tmp_l.find('</')]
                if page_tmp_l[0:4] in ('ISBN', 'Язык', ' Тег', 'Том:', 'Форм', ' Жан'):
                    continue
                if page_tmp_l[0:3] == 'Год':
                    publication_date = page_tmp_l
                    continue
                if i == len(page_tmp) - 1:
                    page = page + page_tmp_l
                else:
                    page = page + page_tmp_l + ", "
            if page == "":
                page = "No Pages Info"

        genre = ""
        genre_list = list()
        genres = soup.find_all(href=re.compile("genre"))
        if not genres:
            genre = 'No Genre'
        else:
            for i in range(len(genres)):
                genre_tmp = str(genres[i])
                genre_tmp = genre_tmp[genre_tmp.find('>') + 1:genre_tmp.find('</')]
                if genre_tmp == 'Жанры':
                    continue
                if genre_tmp in genre_list:
                    continue
                else:
                    genre_list.append(genre_tmp)
                if i == len(genres) - 1:
                    genre = genre + genre_tmp
                else:
                    genre = genre + genre_tmp + ", "
            if genre == "":
                genre = 'No Genre'

        isbn = soup.find('span', itemprop="isbn")
        if isbn is None:
            isbn = 'No ISBN'
        else:
            isbn = str(isbn.text)

        data = [[name, author, page, genre, publication_date, isbn]]
        filename = 'wish_list.csv'
        CsvWriter.csv_write(filename, data)


if __name__ == '__main__':
    ll_login = 'jukka413'  # login пользователя livelib
    links = get_books_pages(ll_login)
    print('Load completed')