-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathstrip_html.py
More file actions
40 lines (36 loc) · 2.89 KB
/
Copy pathstrip_html.py
File metadata and controls
40 lines (36 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from bs4 import BeautifulSoup
import urllib
import re, argparse, sys
import string
from nltk.classify import PositiveNaiveBayesClassifier
import nltk
def get_words_in_article(url):
    '''Build a bag-of-words feature dict for the article at ``url``.

    The page is fetched through the Instapaper mobilizer, which serves an
    article-only version of the URL. Every text node inside ``<body>`` is
    lower-cased, split on whitespace and stripped of non-alphanumeric
    characters. Tokens shorter than two characters and English stopwords
    are dropped.

    Arguments:
        url: Address of the article to fetch.

    Returns:
        A dict mapping each remaining word to ``True``.
    '''
    # URL-encode the target so it can be passed as Instapaper's ``u`` parameter.
    query = urllib.urlencode({'u': url})
    response = urllib.urlopen("http://www.instapaper.com/m?%s" % query)
    try:
        raw_html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    # Only look inside <body>, which keeps JavaScript from being treated as text.
    body = BeautifulSoup(raw_html).find("body")
    text_nodes = body.findAll(text=True)

    # Matches runs of anything that is not a letter or digit.
    non_alnum = re.compile(r'[\W_ ]+')
    # Build the stopword set once: this avoids fetching the list again for
    # every token and makes each membership test a hash lookup.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    features = {}
    # The leading 30 text nodes are Instapaper Mobilizer header content.
    for node in text_nodes[30:]:
        # split() with no argument breaks on any whitespace, so words that
        # are separated by a newline or tab are not glued together once the
        # regex below removes the separator.
        for token in node.lower().split():
            word = non_alnum.sub('', token)
            if len(word) > 1 and word not in stopwords:
                features[word] = True
    return features
if __name__ == '__main__':
    # Command-line entry point: train a PositiveNaiveBayesClassifier on a
    # fixed set of article URLs, then classify the article given by --url
    # and print the most informative features.
    parser = argparse.ArgumentParser(description="Accepts a URL")
    parser.add_argument("--url", dest="url")  # URL of the article to classify
    args = parser.parse_args()
    if args.url is None:
        print("No URL Specified")
        # Exit non-zero so callers can tell that nothing was classified.
        sys.exit(1)

    # NOTE(review): the htc-desire-x-review URL is listed twice -- confirm
    # whether the duplicate is intentional.
    positive_urls = [
        'http://www.engadget.com/2012/11/16/htc-droid-dna-review/',
        'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
    ]
    misc_urls = [
        'http://www.engadget.com/2012/11/16/sharp-aquos-sh930w-reviewed-early-in-russia-with-1080p-display/',
        'http://www.engadget.com/2012/11/15/nexus-4-backordered/',
        'http://www.engadget.com/2012/11/16/htc-windows-phone-8x-t-mobile-review/',
        'http://www.engadget.com/2012/11/16/distro-issue-66-holiday-gift-guide/',
        'http://www.engadget.com/2012/10/29/apple-macbook-pro-with-retina-display-review-13-inch/',
        'http://www.engadget.com/2012/11/17/skydrive-sdk-net-windows-phone-8/',
    ]

    # Materialise the feature dicts as lists so the classifier receives real
    # sequences on both Python 2 and Python 3 (where map() is lazy).
    positive_examples = [get_words_in_article(link) for link in positive_urls]
    misc_examples = [get_words_in_article(link) for link in misc_urls]

    classifier = PositiveNaiveBayesClassifier.train(positive_examples, misc_examples)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(classifier.classify(get_words_in_article(args.url)))
    classifier.show_most_informative_features()