Web_Scrapping-lxml.py
import requests
import sys
import os
import pickle
import datetime
from bs4 import BeautifulSoup as bs
import lxml  # parser backend for BeautifulSoup below; importing fails fast if it is missing
#import urllib.request
#from selenium import webdriver

print('starting')
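# Substrings that mark an href as media, social or script content rather than
# an article, plus literal hrefs to skip outright.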
link_types = ['.wav', '.wma', '.m4a', '.mpeg', '.jpeg', '.mpg', '.mpa', '.asf',
              '.avi', '.gif', '.wmv', '.aif', 'mailto:', '.mov', '.mp3', '.mp4',
              'instagram.com', 'twitter.com', 'facebook.com', '.php', 'javascript:']
ref_notes = ['#', '#content', '#main-content', 'stream.php']
sys.setrecursionlimit(100000)

# Load the dict of news sites, keyed by country/state, built in a previous step.
with open("sites.pickle", "rb") as pickle_loc:
    sites = pickle.load(pickle_loc)

# Date-only stamp (e.g. '2024-01-31'); if today's output already exists, stop.
stamp = str(datetime.datetime.now()).split(" ")[0]
if os.path.isfile(r'articleslinkstitle' + stamp + '.pickle'):
    sys.exit()

test = "https://www.theguardian.com/uk"  # sample front page, kept for manual testing
#print(soup.prettify())
print('start')
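# Parallel lists, one entry per scraped article (errs collects failures separately).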
none_http_links=[]
art_links=[]
summary=[]
name_news=[]
arts=[]
errs=[]
country=[]
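
# Main crawl: for each country/state, fetch each site's front page, follow every
# plausible article link, and keep pages with enough paragraph text.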
for i, state in enumerate(sites.keys()):
    for site in sites[state]:
        # Derive a short outlet name from the URL, e.g. "theguardian".
        news_name = site.replace("https://www.", "")
        news_name = news_name.replace("http://www.", "")
        news_name = news_name.replace("https://", "")
        news_name = news_name.replace("http://", "")
        news_name = news_name.replace(".org", " ")
        news_name = news_name.replace(".com", " ")
        news_name = news_name.replace(".tv", " ")
        news_name = news_name.replace(".net", " ")
        news_name = news_name.replace(".co.", " ")
        news_name = news_name.replace(".", " ")
        news_name = news_name.split(' ')[0]
        if 'http://' not in site and 'https://' not in site:
            site = 'http://' + site
        # Fetch the front page, retrying once before recording the failure.
        try:
            r = requests.get(site, timeout=10.0)
            print(news_name)
        except Exception:
            try:
                r = requests.get(site, timeout=10.0)
                print(news_name)
            except Exception as e:
                print(news_name + ': ' + str(type(e)))
                print(site)
                print('\n')
                errs.append(str(type(e)))
                continue
        #driver = webdriver.PhantomJS(executable_path='PATH TO phantomjs')
        #driver.get(url)
        #r = driver.page_source
        soup = bs(r.content, "lxml")
        #soup = bs(r, "lxml")
        #### fix for different sites
        # Walk every anchor on the front page, skipping media, social and
        # in-page links.
        for link in soup.find_all("a"):
            ref = link.get("href")
            if (ref is None or ref == '' or ref[0] == '#'
                    or any(t in ref for t in link_types)
                    or ref in ref_notes):  # or link.get("data-link-name") != "article":
                continue
            # Resolve relative hrefs against the site root.
            if ref[0] == '/':
                ref = site + ref
            elif 'http:' not in ref and 'www.' not in ref and 'https:' not in ref:
                ref = site + '/' + ref
            try:
                r2 = requests.get(ref, timeout=8.0)
            except Exception as e:
                print(news_name + ': ' + str(type(e)))
                errs.append(str(type(e)))
                continue
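            # Parse the linked page and concatenate its paragraph text.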
            soup2 = bs(r2.content, "lxml")
            article = ""
            for words in soup2.find_all("p"):
                article += words.get_text()
            # Skip pages with too little text to be a real article.
            if len(article) < 1284:
                continue
            arts.append(article)
            country.append(state)
            art_links.append(ref)  # ref is already an absolute URL at this point
            name_news.append(news_name)
            # The link's own text serves as the title (covers nested <span>s too).
            summary.append(link.get_text(strip=True))
    if i == 2:
        print('Completed one run')
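
# Bundle the parallel lists into one dict and pickle it, stamped with today's date.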
#g_data = soup.find_all("div",{"class": "info"})
data = {'site': name_news, 'articles': arts, 'Titles': summary, 'Links': art_links,
        'Time_Stamp': stamp, 'Errors': errs, 'Country': country}
with open(r'articleslinkstitle' + stamp + '.pickle', "wb") as pickle_loc:
    pickle.dump(data, pickle_loc)
print('Found ' + str(len(data['articles'])) + ' articles')