ipyparallel/Data_Parsing at master · Vladimir8243/ipyparallel · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
%%time
# Filter the results: get the list of paragraphs and check them for matches (we are interested in the content of the "м" tag, as in "м. Комсомольская").
# Filter out and drop the irrelevant ads.
# If the ad title mentions our search query (the one sent in the HTTP request), we keep the ad.
# Need to add encoding utf-8 here.

#HTC	3 000 ₽	м. Калужская	https://www.avito.ru/yaya/telefony/htc_1192666842

#
import csv
from urllib.parse import parse_qs, urlsplit

import requests
from bs4 import BeautifulSoup

def get_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The decoded response body (``Response.text``).
    """
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would freeze the whole crawl.
    response = requests.get(url, timeout=timeout)
    return response.text

def get_total_pages(html):
    """Return the page number carried by the last pagination link.

    The value is read from the ``p`` query parameter of the last
    ``pagination-page`` anchor inside the ``pagination js-pages`` block.

    Args:
        html: Markup of a search-results page.

    Returns:
        The ``p`` value of the last pagination link, as an int.

    Raises:
        AttributeError: If the pagination block is not present in *html*.
        IndexError: If the block contains no ``pagination-page`` links.
        ValueError: If the last link has no numeric ``p`` parameter.
    """
    soup = BeautifulSoup(html, 'lxml')
    pagination = soup.find('div', class_='pagination js-pages')
    last_link = pagination.find_all('a', class_='pagination-page')[-1]
    href = last_link.get('href') or ''
    # Look the parameter up by name so the result does not depend on ``p``
    # being the first item in the query string.
    page_values = parse_qs(urlsplit(href).query).get('p')
    if not page_values:
        raise ValueError('no "p" parameter in pagination link: %r' % (href,))
    return int(page_values[0])

def write_csv(data, path='/home/vlad_pc/Desktop/TRAINNING/avito_data_parsing/avita_filtered2.csv'):
    """Append one ad as a row to a CSV file.

    Args:
        data: Mapping with the keys ``'title'``, ``'price'``, ``'metro'``
            and ``'url'``; the row is written in that column order.
        path: Destination file. It is opened in append mode, so existing
            rows are kept and the file is created when missing.

    Raises:
        KeyError: If *data* lacks one of the four expected keys.
    """
    row = (data['title'], data['price'], data['metro'], data['url'])
    # newline='' hands line-ending control to the csv module; without it
    # platforms that translate '\n' would emit '\r\r\n' row terminators.
    with open(path, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(row)

def get_page_data(html, query='htc'):
    """Write every ad on a listing page whose title mentions *query* to CSV.

    Ads are the ``div.item_table`` elements inside ``div.catalog-list``.
    For each ad whose title contains *query* (case-insensitive), the
    title, price, metro and absolute URL are collected and passed to
    ``write_csv``. A field that cannot be extracted is written as an
    empty string.

    Args:
        html: Markup of a search-results page.
        query: Substring that must occur in the ad title for the ad to
            be kept.

    Returns:
        None. Rows are emitted through ``write_csv``.
    """
    soup = BeautifulSoup(html, 'lxml')
    catalog = soup.find('div', class_='catalog-list')
    if catalog is None:
        # No listing container on this page: there is nothing to extract.
        return

    needle = query.lower()
    for ad in catalog.find_all('div', class_='item_table'):
        # Resolve the title link once; it provides both the title and URL.
        try:
            link = ad.find('div', class_='description').find('h3').find('a')
            title = link.text.strip()
        except AttributeError:
            # An ad without a title link cannot be matched against the
            # query, so skip it instead of aborting the whole page.
            continue

        # Compare in lower case so the match is case-insensitive.
        if needle not in title.lower():
            continue

        try:
            url = 'https://www.avito.ru' + link.get('href')
        except TypeError:
            # The link has no href attribute (get() returned None).
            url = ''
        try:
            price = ad.find('div', class_='about').text.strip()
        except AttributeError:
            price = ''
        try:
            # The metro station is taken from the last <p> of the data block.
            metro = ad.find('div', class_='data').find_all('p')[-1].text.strip()
        except (AttributeError, IndexError):
            metro = ''

        write_csv({
            'title': title,
            'price': price,
            'metro': metro,
            'url': url,
        })

def main(max_pages=None):
    """Crawl the HTC phone listings and store the matching ads.

    The first results page is fetched to learn how many pages exist, then
    every page from 1 up to that number is parsed with ``get_page_data``.

    Args:
        max_pages: Optional upper bound on the number of pages to process.
            ``None`` means all pages reported by the pagination.
    """
    base_url = 'https://www.avito.ru/moskva/telefony?'
    page_part = 'p='
    query_part = '&q=htc'

    first_url = base_url + page_part + '1' + query_part
    first_html = get_html(first_url)
    total_pages = get_total_pages(first_html)
    if max_pages is not None:
        total_pages = min(total_pages, max_pages)

    # Page numbers start at 1, and each page must be parsed inside the
    # loop; parsing after the loop would only handle the last page fetched.
    for page in range(1, total_pages + 1):
        if page == 1:
            # Reuse the page already downloaded for the page count.
            html = first_html
        else:
            url_gen = base_url + page_part + str(page) + query_part
            html = get_html(url_gen)
        get_page_data(html)

# Start the crawl only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()