-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathacquire.py
More file actions
63 lines (44 loc) · 2.03 KB
/
acquire.py
File metadata and controls
63 lines (44 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# imports
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import random
import os
def acquire_codeup_blog():
    '''Scrape the Codeup blog and return its articles as a DataFrame.

    The blog index page is fetched first; every anchor with the CSS class
    ``more-link`` is treated as a link to an article. Each linked page is
    then downloaded and its ``h1.entry-title`` and ``div.entry-content``
    elements are extracted.

    Returns:
        pandas.DataFrame with one row per article and the columns
        ``title`` (raw heading text) and ``article`` (body text with
        leading/trailing whitespace stripped). The frame is empty, but
        still has both columns, when the index page yields no links.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx response from any requested page.
        AttributeError: if an article page lacks the expected title or
            content element (``soup.find`` returns ``None``).
    '''
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0",
        "Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0",
        "Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0",
    ]
    # Send the whole randomly chosen string. Indexing the chosen string with
    # [0] would transmit only its first character ("M") as the User-Agent.
    headers = {'User-Agent': random.choice(user_agents)}
    # Without a timeout, requests may wait on an unresponsive server forever.
    timeout = 30

    base_url = 'https://codeup.edu/blog/'
    response = requests.get(base_url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were the blog.
    response.raise_for_status()
    base_soup = BeautifulSoup(response.text, 'html.parser')
    # href=True skips "more-link" anchors that carry no href attribute.
    blog_links = [
        element['href']
        for element in base_soup.find_all('a', class_='more-link', href=True)
    ]

    blog_contents = []
    for link in blog_links:
        response = requests.get(link, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('h1', class_='entry-title').text
        body = soup.find('div', class_='entry-content').text.strip()
        blog_contents.append({'title': title, 'article': body})

    # Naming the columns keeps the schema stable even when nothing was scraped.
    return pd.DataFrame(blog_contents, columns=['title', 'article'])
def acquire_news_articles():
    '''Scrape Inshorts news articles for a fixed set of categories.

    For each of the categories ``business``, ``sports``, ``technology`` and
    ``entertainment`` the page ``https://inshorts.com/en/read/<category>``
    is downloaded. Headlines are read from ``span`` elements with
    ``itemprop="headline"`` and bodies from ``div`` elements with
    ``itemprop="articleBody"``.

    Returns:
        pandas.DataFrame with the columns ``title``, ``body`` and
        ``category``, one row per article, indexed 0..n-1 across all
        categories.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx response for any category page.
        ValueError: if a page yields a different number of headlines and
            bodies (raised by the DataFrame constructor).
    '''
    base_url = 'https://inshorts.com/en/read/'
    categories = ['business', 'sports', 'technology', 'entertainment']
    columns = ['title', 'body', 'category']
    # Without a timeout, requests may wait on an unresponsive server forever.
    timeout = 30

    # Collect one frame per category and concatenate once at the end, rather
    # than re-copying the accumulated rows on every loop iteration.
    frames = []
    for category in categories:
        response = requests.get(base_url + category, timeout=timeout)
        # Fail loudly on an error page instead of silently scraping nothing.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        titles = [
            element.text
            for element in soup.find_all('span', itemprop='headline')
        ]
        bodies = [
            element.text
            for element in soup.find_all('div', itemprop='articleBody')
        ]
        frames.append(
            pd.DataFrame(
                {'title': titles, 'body': bodies, 'category': category},
                columns=columns,
            )
        )

    return pd.concat(frames, axis=0, ignore_index=True)