-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
69 lines (56 loc) · 1.61 KB
/
main.py
File metadata and controls
69 lines (56 loc) · 1.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# coding: utf-8
# Multi-threaded crawler driver (Python 2: uses the `Queue` module and
# `print` statements). Workers pull URLs off a shared queue and store
# (url, title) rows into a sqlite database via Spider.crawl_page.
import threading
import Queue
from spider import Spider
from general import *
import sqlite3
# Choose the Project name
# Filesystem layout: the URL list and the sqlite db both live in the
# working directory by default.
WORK_PATH = './'
DB_PATH = './'
QUEUE_FILE_NAME = 'top-1m.csv'
DB_FILE_NAME = 'test.db'
QUEUE_FILE_PATH = WORK_PATH + QUEUE_FILE_NAME
# Size of the worker-thread pool.
NUMBER_OF_THREADS = 8
DB_FILE_PATH = DB_PATH + DB_FILE_NAME
# file_to_arr comes from `general` (star import): presumably reads the
# CSV into a list of links — TODO confirm against general.py.
queue_links = file_to_arr(QUEUE_FILE_PATH)
# Shared thread-safe job queue consumed by work().
queue = Queue.Queue()
# NOTE(review): the Spider instance is constructed for its side effects
# only — the object itself is discarded. Verify Spider.__init__ does the
# intended setup.
Spider(queue_links)
# Create worker threads (will die when main exits)
def create_workers():
    """Spawn the pool of crawler threads.

    Threads are daemonic, so they are killed automatically when the
    main thread exits — no explicit shutdown protocol is needed.
    """
    workers = [threading.Thread(target=work) for _ in range(NUMBER_OF_THREADS)]
    for worker in workers:
        worker.daemon = True
        worker.start()
# Do the next job in the queue
def work():
    """Worker loop: take the next URL off the shared queue and crawl it.

    Runs forever; the thread is a daemon, so it dies with the main
    thread. Each job is acknowledged with task_done() so queue.join()
    in create_jobs() can unblock.
    """
    # Hoisted out of the loop: the target table never changes per job.
    table_name = 'url_title_rel'
    while True:
        url = queue.get()
        # crawl_page persists the result itself (it receives the db
        # path and table name), so the return value is not needed here.
        Spider.crawl_page(threading.current_thread().name, url, DB_FILE_PATH, table_name)
        queue.task_done()
# Each queued link is a new job
def create_jobs():
    """Turn every pending link into a queued job, then wait for completion."""
    for job_url in queue_links:
        queue.put(job_url)
    # Block until a worker has called task_done() for every queued URL.
    queue.join()
# Check if there are items in the queue, if so crawl them
def crawl():
    """Start crawling if any links remain to be processed."""
    # Empty list is falsy — nothing to do in that case.
    if queue_links:
        create_jobs()
# --- main script ---
from time import clock

# connect the sqlite db the workers write their crawl results into
sqlite_db = sqlite3.connect(DB_FILE_PATH)
sqlite_cu = sqlite_db.cursor()
table_name = 'url_title_rel'
# IF NOT EXISTS: re-running the script against an existing database must
# not crash with OperationalError ("table ... already exists").
sqlite_cu.execute('create table if not exists url_title_rel (id integer primary key,url_tag varchar(50),title_name varchar(100) UNIQUE)')
sqlite_db.commit()

start_time = clock()
create_workers()
crawl()  # blocks until every queued URL has been crawled
finish_time = clock()
# Report elapsed crawl time, not the raw clock reading.
print(finish_time - start_time)

sqlite_db.close()