-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
69 lines (56 loc) · 1.61 KB
/
main.py
File metadata and controls
69 lines (56 loc) · 1.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# coding: utf-8
# Multi-threaded crawler driver (Python 2: uses the `Queue` module and
# `print` statements). Workers pull URLs off a shared queue and store
# (url, title) rows into a sqlite database via Spider.crawl_page.
import threading
import Queue
from spider import Spider
from general import *
import sqlite3
# Choose the Project name
# Filesystem layout: the URL list and the sqlite db both live in the
# working directory by default.
WORK_PATH = './'
DB_PATH = './'
QUEUE_FILE_NAME = 'top-1m.csv'
DB_FILE_NAME = 'test.db'
QUEUE_FILE_PATH = WORK_PATH + QUEUE_FILE_NAME
# Size of the worker-thread pool.
NUMBER_OF_THREADS = 8
DB_FILE_PATH = DB_PATH + DB_FILE_NAME
# file_to_arr comes from `general` (star import): presumably reads the
# CSV into a list of links — TODO confirm against general.py.
queue_links = file_to_arr(QUEUE_FILE_PATH)
# Shared thread-safe job queue consumed by work().
queue = Queue.Queue()
# NOTE(review): the Spider instance is constructed for its side effects
# only — the object itself is discarded. Verify Spider.__init__ does the
# intended setup.
Spider(queue_links)
# Create worker threads (will die when main exits)
def create_workers():
    """Spawn the pool of crawler threads.

    Threads are daemonic, so they are killed automatically when the
    main thread exits — no explicit shutdown protocol is needed.
    """
    workers = [threading.Thread(target=work) for _ in range(NUMBER_OF_THREADS)]
    for worker in workers:
        worker.daemon = True
        worker.start()
# Do the next job in the queue
def work():
    """Worker loop: take the next URL off the shared queue and crawl it.

    Runs forever; the thread is a daemon, so it dies with the main
    thread. Each job is acknowledged with task_done() so queue.join()
    in create_jobs() can unblock.
    """
    # Hoisted out of the loop: the target table never changes per job.
    table_name = 'url_title_rel'
    while True:
        url = queue.get()
        # crawl_page persists the result itself (it receives the db
        # path and table name), so the return value is not needed here.
        Spider.crawl_page(threading.current_thread().name, url, DB_FILE_PATH, table_name)
        queue.task_done()
# Each queued link is a new job
def create_jobs():
    """Turn every pending link into a queued job, then wait for completion."""
    for job_url in queue_links:
        queue.put(job_url)
    # Block until a worker has called task_done() for every queued URL.
    queue.join()
# Check if there are items in the queue, if so crawl them
def crawl():
    """Start crawling if any links remain to be processed."""
    # Empty list is falsy — nothing to do in that case.
    if queue_links:
        create_jobs()
# --- main script ---
from time import clock

# connect the sqlite db the workers write their crawl results into
sqlite_db = sqlite3.connect(DB_FILE_PATH)
sqlite_cu = sqlite_db.cursor()
table_name = 'url_title_rel'
# IF NOT EXISTS: re-running the script against an existing database must
# not crash with OperationalError ("table ... already exists").
sqlite_cu.execute('create table if not exists url_title_rel (id integer primary key,url_tag varchar(50),title_name varchar(100) UNIQUE)')
sqlite_db.commit()

start_time = clock()
create_workers()
crawl()  # blocks until every queued URL has been crawled
finish_time = clock()
# Report elapsed crawl time, not the raw clock reading.
print(finish_time - start_time)

sqlite_db.close()