-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlibqa.py
More file actions
161 lines (126 loc) · 5.34 KB
/
Copy pathlibqa.py
File metadata and controls
161 lines (126 loc) · 5.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import fnmatch
import logging
import os
import requests
import shutil
import sys
import time
"""
LibQA
Takes an arbitrary number of files containing newline-delimited urls and tests each url for status == 200.
Any url that fails, or returns something other than 200, is logged and collected into a single failed
file that can be re-tested.
new*.txt files are supplied by an external script.
failed*.txt files are generated by this script when any url tested fails for any reason.
The script attempts to insulate itself from running over itself by creating a new temp dir for each run
and moving all new* and failed* files into that dir for testing.
"""
# Write every record at INFO level and above to log.txt, each one
# prefixed with a timestamp.
logging.basicConfig(
    filename='log.txt',
    level=logging.INFO,
    format='%(asctime)s %(message)s',
)
# The logging module itself is used as the logger object.
logger = logging

# should use ARGV to get this stuff
# Each tested line is requested as prependUrl + line + appendUrl.
prependUrl = 'http://dspace.mit.edu/openaccess-disseminate/'
appendUrl = ''
def testLine(line, timeout=30):
    """
    Test one line by issuing an HTTP GET for prependUrl + line + appendUrl.

    1. do a get on the line
    2. if get fails, or status != 200, log the error and return False
    3. if get succeeds return True

    line    -- the url fragment to test
    timeout -- seconds to wait for the server before giving up. requests
               waits indefinitely when no timeout is given, so a single
               unresponsive host would otherwise stall the whole run.

    Nothing is raised to the caller for request failures: every
    requests exception (timeouts included) is logged and reported as
    False.
    """
    url = prependUrl + line + appendUrl
    try:
        r = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        logger.error("REQUEST ERROR: {0}".format(e))
        return False
    if r.status_code == 200:
        return True
    logger.warning("status: '{0}'".format(r.status_code))
    return False
def processFiles(pid):
    """
    once there are files in a temp dir, this will:
    1. build a list of all files to test (new*.txt and failed*.txt)
    2. opens any existing success file
       - build a set of already succeeded urls from that file
    3. opens a new failed file for writing, overwriting the old file
    4. opens the success file for appending (created if missing)
    5. iterates through every line of every file
       - skip blank lines
       - skip if url was already tested in this run or is in the success file
       - test the url
       - if success, add url to success file
       - if failed, add to failed file
    6. move the failed file back to app root (only if it is non-empty)
    7. clean up temp dir

    pid -- suffix of the processing directory, i.e. 'process_' + pid

    Returns None. File-system errors are logged rather than raised; if
    the processing directory cannot be listed (e.g. it does not exist)
    the error is logged and nothing else is done. When no matching files
    are found the directory is left untouched.
    """
    process_dirname = 'process_'+ pid
    success_filename = process_dirname +'/'+ 'process_success_list.txt'
    failed_filename = process_dirname +'/'+ 'process_failed_list.txt'

    # Listing the directory is the first thing that can fail (bad pid on a
    # retry), so it is guarded like the rest of the file-system work.
    try:
        dir_entries = os.listdir(process_dirname)
    except (IOError, OSError) as e:
        logger.error(e)
        return

    file_list = fnmatch.filter(dir_entries, 'new*.txt') + fnmatch.filter(dir_entries, 'failed*.txt')
    if not file_list:
        return

    try:
        # maintain a set of already-tested links *for this run*
        # note that this can be different than just the success_file list
        # because we may simply have duplicate urls, and we don't want to bother
        # re-testing them in any single run, whether or not they succeed
        tested = set()
        if os.path.isfile(success_filename):
            with open(success_filename, 'r') as success_file:
                for line in success_file:
                    tested.add(line.strip())

        with open(success_filename, 'a') as success_file:
            with open(failed_filename, 'w') as failed_file:
                for filename in file_list:
                    logger.info('processing file: '+ filename)
                    with open(process_dirname +'/'+ filename, 'r') as processing_file:
                        for line in processing_file:
                            line = line.strip()
                            if not line:
                                # a blank line would be requested as the bare
                                # prefix url and then written back as an empty
                                # entry, so it is ignored
                                continue
                            logger.info("testing line '"+ line +"'")
                            if line in tested:
                                logger.warning("duplicate line")
                                continue
                            if testLine(line):
                                success_file.write(line +"\n")
                            else:
                                failed_file.write(line +"\n")
                            tested.add(line)

        # both output files are closed at this point, so the failed list can
        # be moved out safely before the directory is removed
        if os.path.getsize(failed_filename):
            shutil.move(failed_filename, './failed_'+ pid +'.txt')
        shutil.rmtree(process_dirname)
    except (IOError, OSError, shutil.Error) as e:
        logger.error(e)
def moveFiles(pid):
    """
    when there are existing files in the app root dir
    the app will move them to a temp dir for processing
    this eliminates the chance that running this app twice
    will cause problems

    pid -- suffix of the processing directory, i.e. 'process_' + pid

    Returns the list of new*.txt / failed*.txt filenames that were
    actually moved into the processing directory. If the directory
    cannot be created, or a move fails part-way, the error is logged and
    only the files moved so far are returned, so an empty list means
    there is nothing to process.
    """
    process_dirname = 'process_'+ pid
    root_entries = os.listdir('.')
    file_list = fnmatch.filter(root_entries, 'new*.txt') + fnmatch.filter(root_entries, 'failed*.txt')
    logger.info('found files: '+ ','.join(file_list))

    moved = []
    if file_list:
        try:
            os.mkdir(process_dirname)
            # move the files to process into a temp processing dir
            for filename in file_list:
                shutil.move(filename, process_dirname +'/'+ filename)
                moved.append(filename)
        except (IOError, OSError, shutil.Error) as e:
            # mkdir raises OSError (e.g. the directory already exists),
            # which is not an IOError on Python 2
            logger.error(e)
    return moved
def main():
    """
    Entry point: log start/end and run either a retry or a fresh run.

    With a command-line argument, that argument is used as the pid of
    an existing run and its processing directory is processed again.
    Without one, a timestamp pid is generated, files are moved into the
    processing directory, and processing happens only if any files were
    returned by moveFiles.
    """
    script_name = sys.argv[0]
    logger.info('start '+ script_name)

    if len(sys.argv) <= 1:
        # kick off a fresh run
        pid = time.strftime("%Y%m%d_%H%M%S")
        moved = moveFiles(pid)
        if len(moved) > 0:
            processFiles(pid)
    else:
        # specified an existing run to re-try
        pid = sys.argv[1]
        processFiles(pid)

    logger.info('end '+ script_name)


if __name__ == "__main__":
    main()