WIP: New hashing schema #59

base: master
Changes from all commits: 0f5e044, 1f92656, 1596b37, ed84cec, b3601cb, b812795, 0f1a969, 3797c8f, 97f3e37
csvapi/parseview.py

```diff
@@ -1,6 +1,8 @@
 import os
 import tempfile
 import time
 
+import xxhash
+
 import aiohttp
 import validators
```
```diff
@@ -9,14 +11,15 @@
 from csvapi.errors import APIError
 from csvapi.parser import parse
-from csvapi.utils import already_exists, get_hash
+from csvapi.utils import is_hash_relevant, get_hash, age_valid
 
+X = xxhash.xxh64()
 
 class ParseView(MethodView):
 
     @staticmethod
     async def do_parse(url, urlhash, encoding, storage, logger, sniff_limit, max_file_size):
-        logger.debug('* do_parse %s (%s)', urlhash, url)
+        logger.debug('* do_parse (%s)', url)
         tmp = tempfile.NamedTemporaryFile(delete=False)
         chunk_count = 0
         chunk_size = 1024
```
```diff
@@ -30,13 +33,23 @@ async def do_parse(url, urlhash, encoding, storage, logger, sniff_limit, max_file_size):
                         raise Exception('File too big (max size is %s bytes)' % max_file_size)
                     if not chunk:
                         break
+                    X.update(chunk)
                     tmp.write(chunk)
                     chunk_count += 1
            tmp.close()
-           logger.debug('* Downloaded %s', urlhash)
-           logger.debug('* Parsing %s...', urlhash)
-           parse(tmp.name, urlhash, storage, encoding=encoding, sniff_limit=sniff_limit)
-           logger.debug('* Parsed %s', urlhash)
+           filehash = X.hexdigest()
+           logger.debug('* Downloaded %s', filehash)
+           if not is_hash_relevant(urlhash, filehash):
+               print("HASH IS NOT RELEVANT")
```
> **Contributor:** remove
```diff
+               try:
+                   logger.debug('* Parsing %s...', filehash)
+                   parse(tmp.name, urlhash, filehash, storage, encoding=encoding, sniff_limit=sniff_limit)
+                   logger.debug('* Parsed %s', filehash)
+               except Exception as e:
+                   raise APIError('Error parsing CSV: %s' % e)
+           else:
+               print("HASH IS RELEVANT")
```
> **Contributor:** remove
| logger.info(f"File hash for {urlhash} is relevant, skipping parse.") | ||
| finally: | ||
| logger.debug('Removing tmp file: %s', tmp.name) | ||
| os.unlink(tmp.name) | ||
|
|
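One thing this hunk leans on: an `xxhash.xxh64()` object is stateful, like a `hashlib` digest, so the module-level `X` above keeps accumulating bytes across every download the process handles unless it is reset. A minimal sketch of the same incremental hashing with a per-download hasher instead (function name hypothetical, not part of the PR):

```python
import xxhash

def hash_chunks(chunks):
    # One fresh hasher per file, so successive or concurrent downloads
    # cannot bleed into each other's digest.
    h = xxhash.xxh64()
    for chunk in chunks:
        h.update(chunk)
    return h.hexdigest()

# Incremental hashing is chunk-boundary independent:
# hash_chunks([b'a,b\n', b'1,2\n']) == hash_chunks([b'a,b\n1,2\n'])
```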
```diff
@@ -50,22 +63,25 @@ async def get(self):
         if not validators.url(url):
             raise APIError('Malformed url parameter.', status=400)
         urlhash = get_hash(url)
+        storage = app.config['DB_ROOT_DIR']
 
-        if not already_exists(urlhash):
+        if not age_valid(storage, urlhash):
+            print("AGE IS NOT OK")
             try:
-                storage = app.config['DB_ROOT_DIR']
-                await self.do_parse(url=url,
-                                    urlhash=urlhash,
-                                    encoding=encoding,
-                                    storage=storage,
-                                    logger=app.logger,
-                                    sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
-                                    max_file_size=app.config.get('MAX_FILE_SIZE')
-                                    )
+                await self.do_parse(
+                    url=url,
+                    urlhash=urlhash,
+                    encoding=encoding,
+                    storage=storage,
+                    logger=app.logger,
+                    sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
+                    max_file_size=app.config.get('MAX_FILE_SIZE')
+                )
```
> **Contributor:** Indentation
```diff
             except Exception as e:
                 raise APIError('Error parsing CSV: %s' % e)
         else:
-            app.logger.info(f"{urlhash}.db already exists, skipping parse.")
+            print("AGE IS OK")
```
> **Contributor:** Remove
```diff
+            app.logger.info(f"Db for {urlhash} is young enough, serving as is.")
 
         scheme = 'https' if app.config.get('FORCE_SSL') else request.scheme
         return jsonify({
             'ok': True,
```
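Taken together, the handler now makes its cache decision in two stages: an age gate before any download (`age_valid`), then a content gate once the bytes have arrived (`is_hash_relevant` inside `do_parse`). A self-contained toy model of that flow, using local stand-ins rather than the csvapi API:

```python
import datetime

def serve(url, cache, max_age_days, today, download):
    urlhash = hash(url)            # stand-in for get_hash(url)
    entry = cache.get(urlhash)
    # Stage 1: age gate -- skip the download entirely if the entry is fresh.
    if entry and (today - entry['created']).days <= max_age_days:
        return entry['db']
    body = download(url)
    filehash = hash(body)          # stand-in for the streamed xxh64 digest
    # Stage 2: content gate -- re-parse only if the bytes actually changed.
    if entry and entry['filehash'] == filehash:
        return entry['db']
    db = f"parsed:{filehash}"
    cache[urlhash] = {'filehash': filehash, 'created': today, 'db': db}
    return db

# Usage:
cache = {}
serve('http://x/data.csv', cache, 7, datetime.date.today(),
      lambda u: 'a,b\n1,2\n')
```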
csvapi/utils.py

```diff
@@ -1,4 +1,8 @@
 import hashlib
+import os
+import datetime
+import sqlite3
+import xxhash
 
 from concurrent import futures
 from pathlib import Path
```
```diff
@@ -8,15 +12,78 @@
 executor = None
 
 
-def get_db_info(urlhash, storage=None):
-    # app.config not thread safe, sometimes we need to pass storage directly
+def create_sys_db(app, storage=None):
+    # We do not use the get_sys_db_info here because the call is made outside of the app context.
     storage = storage or app.config['DB_ROOT_DIR']
-    dbpath = f"{storage}/{urlhash}.db"
+    dbpath = f"{storage}/sys.db"
+
+    conn = sqlite3.connect(dbpath)
+    with conn:
+        conn.execute("CREATE TABLE IF NOT EXISTS csvapi_sys (id integer primary key, db_uuid text, urlhash text, filehash text, creation_time date)")
+    conn.close()
+
+
+def get_sys_db_info():
+    storage = app.config['DB_ROOT_DIR']
+    dbpath = f"{storage}/sys.db"
     return {
         'dsn': f"sqlite:///{dbpath}",
-        'db_name': urlhash,
+        'db_name': "sys.db",
+        'table_name': "csvapi_sys",
         'db_path': dbpath,
     }
+
+
+def add_entry_to_sys_db(uuid, urlhash, filehash):
+    now = datetime.datetime.now()
+    now_str = now.strftime('%Y-%m-%d')
+
+    sys_db = get_sys_db_info()
+    conn = sqlite3.connect(sys_db['db_path'])
+    with conn:
+        conn.execute("INSERT INTO csvapi_sys (db_uuid, urlhash, filehash, creation_time) values (?, ?, ?, ?)", (uuid, urlhash, filehash, now_str))
+    conn.close()
+
+
+def get_db_info(urlhash=None, filehash=None, storage=None):
+    storage = storage or app.config['DB_ROOT_DIR']
+
+    sys_db = get_sys_db_info()
+    conn = sqlite3.connect(sys_db['db_path'])
+    c = conn.cursor()
+
+    # The function permits seeking by urlhash or by filehash because of the uploadview.
+    # Do we want to keep things this way?
+    if urlhash is not None:
+        c.execute('SELECT * FROM csvapi_sys WHERE urlhash=?', (urlhash,))
+    elif filehash is not None:
+        c.execute('SELECT * FROM csvapi_sys WHERE filehash=?', (filehash,))
+    else:
+        raise RuntimeError('get_db_info needs at least one non-None argument')
+
+    res = c.fetchone()
```
> **Contributor:** The code below can probably be made cleaner/shorter by getting column values in a dict instead of a list, e.g. …
```diff
+    if not res:
+        return None
+
+    dbuuid = res[1]
+    urlhash = res[2]
+    filehash = res[3]
+    creadate = res[4]
+    dbpath = f"{storage}/{dbuuid}.db"
+    dbname = dbuuid
+
+    conn.close()
+    return {
+        'db_uuid': dbuuid,
+        'urlhash': urlhash,
+        'filehash': filehash,
+        'creation_date': creadate,
+        'table_name': urlhash,
+        'db_path': dbpath,
+        'db_name': dbname,
+        'dsn': f"sqlite:///{dbpath}"
+    }
```
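The review comment above suggests fetching columns into a dict, and Python's `sqlite3` supports this natively via `sqlite3.Row`. A minimal sketch of how the lookup in `get_db_info` could use it (helper name hypothetical; columns as created by `create_sys_db`):

```python
import sqlite3

def fetch_sys_entry(db_path, urlhash):
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row  # rows become mappings keyed by column name
    try:
        row = conn.execute(
            "SELECT * FROM csvapi_sys WHERE urlhash=?", (urlhash,)
        ).fetchone()
        # e.g. row['db_uuid'], row['filehash'] instead of res[1], res[3]
        return dict(row) if row else None
    finally:
        conn.close()
```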
```diff
@@ -34,11 +101,53 @@ def get_hash(to_hash):
 
 
 def get_hash_bytes(to_hash):
-    return hashlib.md5(to_hash).hexdigest()
+    return xxhash.xxh64(to_hash).hexdigest()
 
 
-def already_exists(urlhash):
+def already_exists(filehash):
     cache_enabled = app.config.get('CSV_CACHE_ENABLED')
     if not cache_enabled:
         return False
-    return Path(get_db_info(urlhash)['db_path']).exists()
+
+    db = get_db_info(filehash=filehash)
+    if db is None:
+        return False
+
+    return True
+
+
+def is_hash_relevant(urlhash, filehash):
+    cache_enabled = app.config.get('CSV_CACHE_ENABLED')
+    if not cache_enabled:
+        return False
+
+    db = get_db_info(urlhash=urlhash)
+    if db is None:
+        return False
+
+    # Question here is whether to seek by urlhash or directly by filehash.
+    # Seeking by filehash would save the hash comparison, but are we sure we are
+    # getting the right entry for the urlhash we wanted?
+    # The answer is yes if there can't be more than one entry per urlhash.
+    if db['filehash'] == filehash:
+        return True
+
+    return False
+
+
+def age_valid(storage, urlhash):
+    max_age = app.config['CACHE_MAX_AGE']
+
+    db = get_db_info(urlhash=urlhash)
+    if db is None:
+        return False
+
+    date_time_obj = datetime.datetime.strptime(db['creation_date'], '%Y-%m-%d')
+    later_time = datetime.datetime.now()
+    file_age = later_time - date_time_obj
+    if file_age.days > max_age:
+        return False
+
+    return True
```
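Since `creation_time` is stored as a `%Y-%m-%d` string, the age computation resolves to whole days. A quick, self-contained check of the arithmetic `age_valid` relies on (date value hypothetical):

```python
import datetime

created = datetime.datetime.strptime('2020-03-01', '%Y-%m-%d')
age = datetime.datetime.now() - created
# age_valid returns False once age.days exceeds CACHE_MAX_AGE (in days).
# Same-day entries give age.days == 0 because the parsed time part is midnight.
print(age.days)
```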
requirements.txt

```diff
@@ -10,3 +10,4 @@ quart-cors==0.2.0
 raven==6.10.0
 cchardet==2.1.4
 python-stdnum==1.11
+xxhash==1.4.3
```
> Missing line at EOF