book.py
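"""Scrape a LiveLib user's wish list and export book details to a CSV file.

The script walks the paginated wish-list pages, collects the link of every
book, then opens each book page and extracts the title, author, page count,
genres, publication year and ISBN.
"""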
import re
from bs4 import BeautifulSoup
from csv_writer import CsvWriter
from get_html import GetHTML
from get_books import GetBook


def get_books_pages(ll_login):
    """Walk the paginated wish list of a LiveLib user and parse every book found."""
    url_base = 'https://livelib.ru/reader/'
    url_text = ll_login + '/wish/~'
    has_more_pages = True
    books = []
    webpage = 1
    while has_more_pages:
        url = url_base + url_text + str(webpage)
        html = GetHTML.get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        # A wish-list page contains 'book-data' blocks only while there are books left;
        # an empty page marks the end of the list.
        check_list = soup.find('div', class_='book-data')
        if check_list:
            # Collect the book links from the current page and process them immediately,
            # then clear the list so each page is handled on its own.
            books = GetBook.get_books(html, books)
            webpage += 1
            parse_books_info(books)
            books = []
        else:
            has_more_pages = False
    return books


def parse_books_info(books):
    """Open every book page, extract its details and append them to the CSV file."""
    for i in range(len(books)):
        link = 'https://livelib.ru/' + books[i]
        print(link)
        print('Downloaded', i + 1, 'of', len(books))
        html = GetHTML.get_html(link)
        soup = BeautifulSoup(html, 'lxml')
        # Book title.
        name = soup.find('title', id="title-head")
        name = str(name.text) if name is not None else 'No Title Info'
        # First link that points to an author page.
        author = soup.find(href=re.compile("author/"))
        author = author.text if author is not None else 'No Author Info'
        # Edition details (page count, publication year, etc.) live in the second
        # 'bc-info__wrapper' block of the page.
        pages = soup.find_all('div', class_='bc-info__wrapper')
        page_parts = []
        publication_date = 'No date info'
        # find_all() returns a list (never None), so check for an empty result instead.
        if not pages:
            page = 'No Pages Info'
        else:
            try:
                page_tmp = pages[1]
            except IndexError:
                # The page has an unexpected layout; skip this book entirely.
                continue
            page_tmp = page_tmp.find_all('p')
            for tag in page_tmp:
                page_tmp_l = str(tag)
                page_tmp_l = page_tmp_l.replace(' ', '').replace('\n', ' ').replace('<br/>', ' ')
                # Keep only the text between the opening and closing tag.
                page_tmp_l = page_tmp_l[page_tmp_l.find('>') + 1:page_tmp_l.find('</')]
                # Skip fields that are handled elsewhere or not needed
                # (ISBN, language, tags, volume, format, genres).
                if page_tmp_l[0:4] in ('ISBN', 'Язык', ' Тег', 'Том:', 'Форм', ' Жан'):
                    continue
                # Lines starting with 'Год' ('Year') hold the publication year.
                if page_tmp_l[0:3] == 'Год':
                    publication_date = page_tmp_l
                    continue
                page_parts.append(page_tmp_l)
            page = ", ".join(page_parts) if page_parts else "No Pages Info"
genre = ""
genre_list = list()
genres = soup.find_all(href=re.compile("genre"))
if not genres:
genre = 'No Genre'
else:
for i in range(len(genres)):
genre_tmp = str(genres[i])
genre_tmp = genre_tmp[genre_tmp.find('>') + 1:genre_tmp.find('</')]
if genre_tmp == 'Жанры':
continue
if genre_tmp in genre_list:
continue
else:
genre_list.append(genre_tmp)
if i == len(genres) - 1:
genre = genre + genre_tmp
else:
genre = genre + genre_tmp + ", "
if genre == "":
genre = 'No Genre'
        # ISBN is exposed through a span with the 'isbn' itemprop.
        isbn = soup.find('span', itemprop="isbn")
        isbn = str(isbn.text) if isbn is not None else 'No ISBN'
        # Write one CSV row per book.
        data = [[name, author, page, genre, publication_date, isbn]]
        filename = 'wish_list.csv'
        CsvWriter.csv_write(filename, data)


if __name__ == '__main__':
    ll_login = 'jukka413'  # LiveLib user login
    get_books_pages(ll_login)  # results are written to wish_list.csv as they are parsed
    print('Load completed')
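
book.py imports three local helper modules that are not shown on this page: get_html, get_books and csv_writer. The sketch below is inferred only from how they are called above and is not the repository's actual code; the use of requests, the User-Agent header and the 'book-item__link' selector are assumptions.

# Hypothetical sketch of the helper modules imported by book.py.
# Inferred from the call sites above; requests usage, the header and the
# link selector are assumptions, not the repository's implementation.
import csv

import requests
from bs4 import BeautifulSoup


class GetHTML:
    @staticmethod
    def get_html(url):
        # Fetch a page and return its HTML as text.
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        return response.text


class GetBook:
    @staticmethod
    def get_books(html, books):
        # Append the relative link of every book found on the page to 'books'.
        soup = BeautifulSoup(html, 'lxml')
        for tag in soup.find_all('a', class_='book-item__link'):  # assumed selector
            href = tag.get('href')
            if href:
                books.append(href.lstrip('/'))
        return books


class CsvWriter:
    @staticmethod
    def csv_write(filename, data):
        # Append the given rows to the CSV file.
        with open(filename, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(data)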