# basicCrawler/test_crawl.py at main · mmbc2008/basicCrawler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import unittest
import requests
from crawl import normalize_url
from crawl import get_h1_from_html, get_first_paragraph_from_html, get_urls_from_html, get_images_from_html, extract_page_data_basic

class TestCrawl(unittest.TestCase):
    """Unit tests for the URL and HTML helpers imported from ``crawl``.

    Each test feeds a small, well-formed HTML snippet (or a bare URL) to one
    helper and compares the result with a hard-coded expectation.  No network
    access is performed by the tests themselves.
    """

    # Page URL handed to the helpers that take a base URL alongside the HTML.
    BASE_URL = "https://www.eventbrite.nl"

    # ------------------------------------------------------------------
    # normalize_url
    # ------------------------------------------------------------------

    def test_normalize_secure_url(self):
        """An ``https://`` prefix is expected to be dropped."""
        input_url = "https://www.eventbrite.nl"
        actual = normalize_url(input_url)
        expected = "www.eventbrite.nl"
        self.assertEqual(actual, expected)

    def test_normalize_unsecure_url(self):
        """An ``http://`` prefix is expected to be dropped."""
        input_url = "http://www.eventbrite.nl"
        actual = normalize_url(input_url)
        expected = "www.eventbrite.nl"
        self.assertEqual(actual, expected)

    def test_normalize_capitalized_url(self):
        """An upper-case URL is expected to come back lower-cased."""
        input_url = "HTTP://WWW.EVENTBRITE.NL"
        actual = normalize_url(input_url)
        expected = "www.eventbrite.nl"
        self.assertEqual(actual, expected)

    def test_normalize_url_with_slash(self):
        """A trailing slash is expected to be dropped."""
        input_url = "https://www.eventbrite.nl/"
        actual = normalize_url(input_url)
        expected = "www.eventbrite.nl"
        self.assertEqual(actual, expected)

    # ------------------------------------------------------------------
    # get_h1_from_html
    # ------------------------------------------------------------------

    def test_get_h1_from_html(self):
        """The text of an ``<h1>`` element is returned."""
        input_html = "<html><body><h1>Test Heading</h1></body></html>"
        actual = get_h1_from_html(input_html)
        expected = "Test Heading"
        self.assertEqual(actual, expected)

    def test_missing_h1_from_html(self):
        """A document without ``<h1>`` yields an empty string."""
        input_html = "<html><body><div>Random Text</div></body></html>"
        actual = get_h1_from_html(input_html)
        expected = ""
        self.assertEqual(actual, expected)

    def test_get_empty_h1_from_html(self):
        """An empty ``<h1>`` yields an empty string."""
        input_html = "<html><body><h1></h1></body></html>"
        actual = get_h1_from_html(input_html)
        expected = ""
        self.assertEqual(actual, expected)

    # ------------------------------------------------------------------
    # get_first_paragraph_from_html
    # ------------------------------------------------------------------

    def test_get_p_from_html(self):
        """The text of a ``<p>`` element is returned."""
        input_html = "<html><body><p>Test Paragraph</p></body></html>"
        actual = get_first_paragraph_from_html(input_html)
        expected = "Test Paragraph"
        self.assertEqual(actual, expected)

    def test_get_missing_p_from_html(self):
        """A document without ``<p>`` yields an empty string."""
        # The closing tag used to be a second opening <div> (typo).
        input_html = "<html><body><div>Random Text</div></body></html>"
        actual = get_first_paragraph_from_html(input_html)
        expected = ""
        self.assertEqual(actual, expected)

    def test_get_empty_p_from_html(self):
        """An empty ``<p>`` yields an empty string."""
        input_html = "<html><body><p></p></body></html>"
        actual = get_first_paragraph_from_html(input_html)
        expected = ""
        self.assertEqual(actual, expected)

    # ------------------------------------------------------------------
    # get_urls_from_html
    # ------------------------------------------------------------------

    def test_get_urls_from_html(self):
        """A single absolute ``href`` is returned unchanged in a list."""
        input_body = (
            '<html><body><a href="https://www.eventbrite.nl">'
            "<span>Boot.dev</span></a></body></html>"
        )
        actual = get_urls_from_html(input_body, self.BASE_URL)
        expected = ["https://www.eventbrite.nl"]
        self.assertEqual(actual, expected)

    def test_missing_url_in_html_(self):
        """An ``<a>`` without ``href`` contributes no URL."""
        input_body = (
            '<html><body><a title="This link has no destination">'
            "Click Here?</a></body></html>"
        )
        actual = get_urls_from_html(input_body, self.BASE_URL)
        expected = []
        self.assertEqual(actual, expected)

    def test_get_multiple_urls_from_html(self):
        """Root-relative, absolute and relative links come back in order.

        Relative links are expected to be resolved against the base URL,
        while the absolute link keeps its own host.
        """
        input_body = """
            <html>
                <body>
                    <a href="/first_page.html"></a>
                    <a href="https://eventbrite.nl/second_page.html"></a>
                    <a href="third_page.html"></a>
                </body>
            </html>
            """
        actual = get_urls_from_html(input_body, self.BASE_URL)
        expected = [
            "https://www.eventbrite.nl/first_page.html",
            "https://eventbrite.nl/second_page.html",
            "https://www.eventbrite.nl/third_page.html",
        ]
        self.assertEqual(actual, expected)

    # ------------------------------------------------------------------
    # get_images_from_html
    # ------------------------------------------------------------------

    def test_get_images_from_html(self):
        """A single absolute ``src`` is returned unchanged in a list."""
        input_body = (
            '<html><body><img src="https://www.eventbrite.nl/image.jpg">'
            "</body></html>"
        )
        actual = get_images_from_html(input_body, self.BASE_URL)
        expected = ["https://www.eventbrite.nl/image.jpg"]
        self.assertEqual(actual, expected)

    def test_get_images_from_html_no_url(self):
        """An ``<img>`` without ``src`` contributes no URL."""
        input_body = '<html><body><img alt="/image.jpg"></body></html>'
        actual = get_images_from_html(input_body, self.BASE_URL)
        expected = []
        self.assertEqual(actual, expected)

    def test_get_multiple_images_from_html(self):
        """Root-relative, absolute and relative sources come back in order.

        Relative sources are expected to be resolved against the base URL,
        while the absolute source keeps its own host.
        """
        input_body = """
            <html>
                <body>
                    <img src="/first_image.png">
                    <img src="https://eventbrite.nl/second_image.jpg">
                    <img src="third_image.gif">
                </body>
            </html>
            """
        actual = get_images_from_html(input_body, self.BASE_URL)
        expected = [
            "https://www.eventbrite.nl/first_image.png",
            "https://eventbrite.nl/second_image.jpg",
            "https://www.eventbrite.nl/third_image.gif",
        ]
        self.assertEqual(actual, expected)

    # ------------------------------------------------------------------
    # extract_page_data_basic
    # ------------------------------------------------------------------

    def test_extract_page_data_basic(self):
        """All page fields are collected into one dictionary.

        The fixture contains a heading, a paragraph, one link and one image;
        the expected dictionary carries the page URL, the heading text, the
        paragraph text, and the link and image URLs resolved against the
        page URL.
        """
        input_body = """<html><body>
        <h1>Test Title</h1>
        <p>This is the first paragraph.</p>
        <a href="/link1">Link 1</a>
        <img src="/image1.jpg" alt="Image 1">
    </body></html>"""

        actual = extract_page_data_basic(input_body, self.BASE_URL)
        expected = {
            "url": "https://www.eventbrite.nl",
            "heading": "Test Title",
            "first_paragraph": "This is the first paragraph.",
            "outgoing_links": ["https://www.eventbrite.nl/link1"],
            "image_urls": ["https://www.eventbrite.nl/image1.jpg"],
        }

        # Test methods must not return a value; just assert.
        self.assertEqual(actual, expected)


# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()