-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbs4_getting_started.py
More file actions
290 lines (214 loc) · 8.31 KB
/
bs4_getting_started.py
File metadata and controls
290 lines (214 loc) · 8.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# Sample document from the Beautiful Soup docs ("The Dormouse's story").
# Every example below parses and navigates this markup.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
# Parse the sample document with the stdlib parser; `soup` is reused by every
# live example further down the file.
soup = BeautifulSoup(html_doc, 'html.parser')
# BASICS #
# print(soup.prettify())
# print(soup.title)
# print(soup.title.name)
# print(soup.title.string)
# print(soup.title.parent.name)
# print(soup.p)
# print(soup.p['class'])
# print(soup.a)
# print(soup.find_all('a'))
# print(soup.find(id="link3"))
# for link in soup.find_all('a'):
#     print(link.get('href'))
# print(soup.get_text())
# TAGS #
# soup = BeautifulSoup('<b class="boldest" id="the-boldest">Extremely bold</b>', 'html.parser')
# tag = soup.b
# print(type(tag))
## TAG NAMES ##
# print(tag.name)
### changing tag names will reflect in HTML markup generated by bs4
# tag.name = "blockquote"
# print(tag)
## TAG ATTRIBUTES ##
### tag attributes are treated as a python dict
# print(tag['id'])
### you access the dict directly as such
# print(tag.attrs)
### you can add or remove as you would in a dictionary
# tag['id'] = 'verybold'
# tag['another-attribute'] = 1
# print(tag)
# del tag['another-attribute']
# print(tag)
### multi-valued attributes ( like class names ) are treated as lists
# css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
# print(css_soup.p['class'])
# NAVIGABLE STRINGS #
### corresponds to the text within a tag and is just like Python Unicode string, but also supports
### some of the features of navigating and searching the tree
# print(tag.string)
# type(tag.string)
### NavStrings can be converted to unicode using unicode(__NavString__) (Python 2; use str() on Python 3)
### You can't edit the string in place, but you can replace it as such:
# tag.string.replace_with("No longer bold")
# print(tag)
# BEAUTIFULSOUP
### This object represents the document as a whole, but for most purposes can be treated
### as a Tag object
# print(soup.name)
# COMMENTS and SPECIAL STRINGS
### Comments are just a special type of NavigableString
# markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
# soup = BeautifulSoup(markup, 'html.parser')
# comment = soup.b.string
# print(comment)
# print(type(comment))
### When it appears as part of an HTML doc, it has special formatting
# print(soup.b.prettify())
### Other classes for things in an XML Doc: CData, ProcessingInstruction, Declaration, and Doctype
# NAVIGATING THE TREE
## GOING DOWN THE TREE
### The simplest way is to use the tag you want, but this will only give you the first relevant tag
# soup.head
# soup.title
# soup.body.b
# soup.a
# ### To get things more complicated than the first tag with a certain name, need methods for searching the tree
# soup.find_all('a')
# ### A tag's children are available in a list called .contents
# head_tag = soup.head
# head_tag.contents
# title_tag = head_tag.contents[0]
# ### the soup object itself has children, the first of which is the html tag
# len(soup.contents)
# soup.contents[0].name
# ### .children is a generator you can use to iterate over a tag's children
# for child in title_tag.children:
#     print(child)
# ### .descendants lets you iterate over all of a tag's children recursively
# len(list(soup.children))
# for child in soup.children:
#     print(child)
# len(list(soup.descendants))
# for child in soup.descendants:
#     print(child)
# ### .string is used if a tag has only one child, and the child is a NavString
# ### if the tag's only child is another tag and that tag has .string, it will have the same as its child
# ### if a tag contains more than one thing, .string will be None
# title_tag.string
# head_tag.string
# print(soup.html.string)
# ### if there is more than one thing in a tag, you can just use .string
# ### or use .stripped_strings if there is a lot of whitespace
# for string in soup.strings:
#     print(repr(string))
# for string in soup.stripped_strings:
#     print(repr(string))
## GOING UP THE TREE
### navigate upward with the .parent attribute and the .parents iterator;
### a string's parent is its enclosing tag, the <html> tag's parent is the
### BeautifulSoup object, and the soup object itself has no parent (None)
title_tag = soup.title
title_tag
title_tag.parent
title_tag.string.parent
html_tag = soup.html
type(html_tag.parent)
soup.parent  # None
### remember, soup.a only matches the first tag of that type in the document
first_link = soup.a
for ancestor in first_link.parents:
    print(ancestor if ancestor is None else ancestor.name)
## GOING SIDEWAYS
### name the parser explicitly: a bare BeautifulSoup(markup) call picks whatever
### parser happens to be installed (e.g. lxml), emits GuessedAtParserWarning,
### and can parse this malformed markup differently from machine to machine —
### every other call in this file already passes 'html.parser'
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>", 'html.parser')
print(sibling_soup.prettify())
### .next_sibling and .previous_sibling are None if they don't exist
sibling_soup.b.next_sibling
sibling_soup.c.previous_sibling
### contained strings are not siblings if their parent tags are siblings
print(sibling_soup.b.string.next_sibling)
### and the iterators, .next_siblings and .previous_siblings
### in our example soup, the next sibling of the first a tag is actually ',\n'
### because it's within that paragraph tag!
for sibling in soup.a.next_siblings:
    print(repr(sibling))
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))
### .next_element / .previous_element give whatever the parser handled
### immediately after / before this element, in document order
tillie_link = soup.find("a", id="link3")
tillie_link
tillie_link.next_element  # the string "Tillie": parsed right after the <a> tag opened
### the matching iterators are .next_elements and .previous_elements
# SEARCHING THE TREE
### two most popular methods are find() and find_all()
soup.find_all('b')
### using regular expressions is super helpful
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
### the above finds all tags whose name starts with the letter b
### and the below containing the letter t
for tag in soup.find_all(re.compile("t")):
    print(tag.name)
### you can also pass in a list or a boolean - True which returns all tags but not text strings
soup.find_all(["a", "b"])
for tag in soup.find_all(True):
    print(tag.name)
### or pass in a function
def has_class_but_no_id(tag):
    """Return True for tags that carry a 'class' attribute but no 'id'."""
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')
# every tag for which the predicate above returns True
soup.find_all(has_class_but_no_id)
### passing a function on a specific attribute
def not_lacie(href):
    """Accept href values that are present and do not contain 'lacie'."""
    if not href:
        return href
    return re.compile("lacie").search(href) is None
soup.find_all(href=not_lacie)
### find_all looks through all descendants
### and you can pass different attributes, id, string, class_
### limit argument is self explanatory
### recursive argument = False will only look at direct descendants
soup.find_all("title")
# [<title>The Dormouse's story</title>]
soup.find_all("p", "title")
# [<p class="title"><b>The Dormouse's story</b></p>]
soup.find_all("a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.find_all(id="link2")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# (re is already imported above — the duplicate `import re` was removed)
soup.find(string=re.compile("sisters"))
# u'Once upon a time there were three little sisters; and their names were\n'
### calling a tag is like calling find_all(), the below are equivalent
soup.title.find_all(string=True)
soup.title(string=True)
# CSS SELECTORS
### use soup.select to use typical CSS selectors; it returns a list of matches
soup.select("p:nth-of-type(3)")
### test for existence of an attribute
soup.select('a[href]')
### find tags by attribute value
soup.select('a[href="http://example.com/elsie"]')
# MODIFYING THE TREE
# OUTPUT
### print pretty - prettify() on soup object or any tag objects
print(soup.a.prettify())
### non-pretty printing
str(soup)
### NOTE: unicode(soup) was the Python 2 spelling and raises NameError on
### Python 3, where str() already returns unicode text — so it was removed
### formatters
print(soup.prettify(formatter="html")) # convert Unicode characters to HTML entities whenever possible
print(soup.prettify(formatter=None)) # will not modify strings at all on output, may lead to invalid HTML
### pass in a function for formatter,
### Beautiful Soup will call that function once for every string and attribute value in the document
def uppercase(text):
    """Formatter for prettify(): upper-case every string and attribute value.

    The parameter was renamed from `str`, which shadowed the builtin.
    """
    return text.upper()
print(soup.prettify(formatter=uppercase))