From 0f5e044cddc58417fa2bfde1be1a734e5b950124 Mon Sep 17 00:00:00 2001
From: quaxsze
Date: Mon, 27 Jan 2020 16:40:01 +0100
Subject: [PATCH 1/7] new hashing lib

---
 csvapi/parseview.py | 78 ++++++++++++++++++++++++++++++++-------------
 csvapi/utils.py     |  4 +--
 2 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/csvapi/parseview.py b/csvapi/parseview.py
index aaf343b..0f9ddba 100644
--- a/csvapi/parseview.py
+++ b/csvapi/parseview.py
@@ -1,6 +1,8 @@
 import os
 import tempfile
+import time
 
+import xxhash
 import aiohttp
 import validators
 
@@ -11,6 +13,7 @@
 from csvapi.parser import parse
 from csvapi.utils import already_exists, get_hash
 
+X = xxhash.xxh64()
 
 class ParseView(MethodView):
 
@@ -18,12 +21,14 @@ async def options(self):
         pass
 
     @staticmethod
-    async def do_parse(url, urlhash, encoding, storage, logger, sniff_limit, max_file_size):
-        logger.debug('* do_parse %s (%s)', urlhash, url)
+    async def do_parse(url, encoding, storage, logger, sniff_limit, max_file_size):
+        logger.debug('* do_parse (%s)', url)
         tmp = tempfile.NamedTemporaryFile(delete=False)
         chunk_count = 0
         chunk_size = 1024
+        start_dl = time.time()
         try:
+            # TODO: Is it possible to know any change in the hash of a file without downloading it to check it?
             async with aiohttp.ClientSession() as session:
                 async with session.get(url) as resp:
                     while True:
@@ -33,18 +38,34 @@ async def do_parse(url, urlhash, encoding, storage, logger, sniff_limit, max_fil
                             raise Exception('File too big (max size is %s bytes)' % max_file_size)
                         if not chunk:
                             break
+                        X.update(chunk)
                         tmp.write(chunk)
                         chunk_count += 1
             tmp.close()
-            logger.debug('* Downloaded %s', urlhash)
-            logger.debug('* Parsing %s...', urlhash)
-            parse(tmp.name, urlhash, storage, encoding=encoding, sniff_limit=sniff_limit)
-            logger.debug('* Parsed %s', urlhash)
+            filehash = X.hexdigest()
+            print(filehash)
+            logger.debug('* Downloaded %s', filehash)
+            end_dl = time.time()
+            print(f"--------------------------------> download time: {end_dl - start_dl}<------------------------------------")
+            if not already_exists(filehash):
+                try:
+                    start_parse = time.time()
+                    logger.debug('* Parsing %s...', filehash)
+                    parse(tmp.name, filehash, storage, encoding=encoding, sniff_limit=sniff_limit)
+                    logger.debug('* Parsed %s', filehash)
+                    end_parse = time.time()
+                    print(f"--------------------------------> parse time: {end_parse - start_parse}<------------------------------------")
+                except Exception as e:
+                    raise APIError('Error parsing CSV: %s' % e)
+            else:
+                logger.info(f"{filehash}.db already exists, skipping parse.")
+            return filehash
         finally:
             logger.debug('Removing tmp file: %s', tmp.name)
             os.unlink(tmp.name)
 
     async def get(self):
+        start = time.time()
         app.logger.debug('* Starting ParseView.get')
         url = request.args.get('url')
         encoding = request.args.get('encoding')
@@ -52,25 +73,36 @@ async def get(self):
             raise APIError('Missing url query string variable.', status=400)
         if not validators.url(url):
             raise APIError('Malformed url parameter.', status=400)
-        urlhash = get_hash(url)
+
+        storage = app.config['DB_ROOT_DIR']
+        filehash = await self.do_parse(url=url,
+                                       encoding=encoding,
+                                       storage=storage,
+                                       logger=app.logger,
+                                       sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
+                                       max_file_size=app.config.get('MAX_FILE_SIZE')
+                                       )
 
-        if not already_exists(urlhash):
-            try:
-                storage = app.config['DB_ROOT_DIR']
-                await self.do_parse(url=url,
-                                    urlhash=urlhash,
-                                    encoding=encoding,
-                                    storage=storage,
-                                    logger=app.logger,
-                                    sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
-                                    max_file_size=app.config.get('MAX_FILE_SIZE')
-                                    )
-            except Exception as e:
-                raise APIError('Error parsing CSV: %s' % e)
-        else:
-            app.logger.info(f"{urlhash}.db already exists, skipping parse.")
+        # if not already_exists(urlhash):
+        #     try:
+        #         storage = app.config['DB_ROOT_DIR']
+        #         await self.do_parse(url=url,
+        #                             urlhash=urlhash,
+        #                             encoding=encoding,
+        #                             storage=storage,
+        #                             logger=app.logger,
+        #                             sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
+        #                             max_file_size=app.config.get('MAX_FILE_SIZE')
+        #                             )
+        #     except Exception as e:
+        #         raise APIError('Error parsing CSV: %s' % e)
+        #     else:
+        #         app.logger.info(f"{urlhash}.db already exists, skipping parse.")
         scheme = 'https' if app.config.get('FORCE_SSL') else request.scheme
+        end = time.time()
+        timer = end - start
+        print(f"--------------------------------> total execution time: {timer}<------------------------------------")
         return jsonify({
             'ok': True,
-            'endpoint': f"{scheme}://{request.host}/api/{urlhash}"
+            'endpoint': f"{scheme}://{request.host}/api/{filehash}"
         })

diff --git a/csvapi/utils.py b/csvapi/utils.py
index 0fbd146..732f57c 100644
--- a/csvapi/utils.py
+++ b/csvapi/utils.py
@@ -33,8 +33,8 @@ def get_hash(to_hash):
     return hashlib.md5(to_hash.encode('utf-8')).hexdigest()
 
 
-def already_exists(urlhash):
+def already_exists(filehash):
     cache_enabled = app.config.get('CSV_CACHE_ENABLED')
     if not cache_enabled:
         return False
-    return Path(get_db_info(urlhash)['db_path']).exists()
+    return Path(get_db_info(filehash)['db_path']).exists()

From 1596b370cfd5e32e06aee58e12c0ea0108312616 Mon Sep 17 00:00:00 2001
From: quaxsze
Date: Mon, 27 Jan 2020 16:40:01 +0100
Subject: [PATCH 2/7] new hashing lib

---
 csvapi/parseview.py | 78 ++++++++++++++++++++++++++++++++-------------
 csvapi/utils.py     |  4 +--
 2 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/csvapi/parseview.py b/csvapi/parseview.py
index bcb533b..59f7bfe 100644
--- a/csvapi/parseview.py
+++ b/csvapi/parseview.py
@@ -1,6 +1,8 @@
 import os
 import tempfile
+import time
 
+import xxhash
 import aiohttp
 import validators
 
@@ -11,16 +13,19 @@
 from csvapi.parser import parse
 from csvapi.utils import already_exists, get_hash
 
+X = xxhash.xxh64()
 
 class ParseView(MethodView):
 
     @staticmethod
-    async def do_parse(url, urlhash, encoding, storage, logger, sniff_limit, max_file_size):
-        logger.debug('* do_parse %s (%s)', urlhash, url)
+    async def do_parse(url, encoding, storage, logger, sniff_limit, max_file_size):
+        logger.debug('* do_parse (%s)', url)
         tmp = tempfile.NamedTemporaryFile(delete=False)
         chunk_count = 0
         chunk_size = 1024
+        start_dl = time.time()
         try:
+            # TODO: Is it possible to know any change in the hash of a file without downloading it to check it?
             async with aiohttp.ClientSession() as session:
                 async with session.get(url) as resp:
                     while True:
@@ -30,18 +35,34 @@ async def do_parse(url, urlhash, encoding, storage, logger, sniff_limit, max_fil
                             raise Exception('File too big (max size is %s bytes)' % max_file_size)
                         if not chunk:
                             break
+                        X.update(chunk)
                         tmp.write(chunk)
                         chunk_count += 1
             tmp.close()
-            logger.debug('* Downloaded %s', urlhash)
-            logger.debug('* Parsing %s...', urlhash)
-            parse(tmp.name, urlhash, storage, encoding=encoding, sniff_limit=sniff_limit)
-            logger.debug('* Parsed %s', urlhash)
+            filehash = X.hexdigest()
+            print(filehash)
+            logger.debug('* Downloaded %s', filehash)
+            end_dl = time.time()
+            print(f"--------------------------------> download time: {end_dl - start_dl}<------------------------------------")
+            if not already_exists(filehash):
+                try:
+                    start_parse = time.time()
+                    logger.debug('* Parsing %s...', filehash)
+                    parse(tmp.name, filehash, storage, encoding=encoding, sniff_limit=sniff_limit)
+                    logger.debug('* Parsed %s', filehash)
+                    end_parse = time.time()
+                    print(f"--------------------------------> parse time: {end_parse - start_parse}<------------------------------------")
+                except Exception as e:
+                    raise APIError('Error parsing CSV: %s' % e)
+            else:
+                logger.info(f"{filehash}.db already exists, skipping parse.")
+            return filehash
         finally:
             logger.debug('Removing tmp file: %s', tmp.name)
             os.unlink(tmp.name)
 
     async def get(self):
+        start = time.time()
         app.logger.debug('* Starting ParseView.get')
         url = request.args.get('url')
         encoding = request.args.get('encoding')
@@ -49,25 +70,36 @@ async def get(self):
             raise APIError('Missing url query string variable.', status=400)
         if not validators.url(url):
             raise APIError('Malformed url parameter.', status=400)
-        urlhash = get_hash(url)
+
+        storage = app.config['DB_ROOT_DIR']
+        filehash = await self.do_parse(url=url,
+                                       encoding=encoding,
+                                       storage=storage,
+                                       logger=app.logger,
+                                       sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
+                                       max_file_size=app.config.get('MAX_FILE_SIZE')
+                                       )
 
-        if not already_exists(urlhash):
-            try:
-                storage = app.config['DB_ROOT_DIR']
-                await self.do_parse(url=url,
-                                    urlhash=urlhash,
-                                    encoding=encoding,
-                                    storage=storage,
-                                    logger=app.logger,
-                                    sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
-                                    max_file_size=app.config.get('MAX_FILE_SIZE')
-                                    )
-            except Exception as e:
-                raise APIError('Error parsing CSV: %s' % e)
-        else:
-            app.logger.info(f"{urlhash}.db already exists, skipping parse.")
+        # if not already_exists(urlhash):
+        #     try:
+        #         storage = app.config['DB_ROOT_DIR']
+        #         await self.do_parse(url=url,
+        #                             urlhash=urlhash,
+        #                             encoding=encoding,
+        #                             storage=storage,
+        #                             logger=app.logger,
+        #                             sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
+        #                             max_file_size=app.config.get('MAX_FILE_SIZE')
+        #                             )
+        #     except Exception as e:
+        #         raise APIError('Error parsing CSV: %s' % e)
+        #     else:
+        #         app.logger.info(f"{urlhash}.db already exists, skipping parse.")
         scheme = 'https' if app.config.get('FORCE_SSL') else request.scheme
+        end = time.time()
+        timer = end - start
+        print(f"--------------------------------> total execution time: {timer}<------------------------------------")
         return jsonify({
             'ok': True,
-            'endpoint': f"{scheme}://{request.host}/api/{urlhash}"
+            'endpoint': f"{scheme}://{request.host}/api/{filehash}"
         })

diff --git a/csvapi/utils.py b/csvapi/utils.py
index f526cea..92b65dc 100644
--- a/csvapi/utils.py
+++ b/csvapi/utils.py
@@ -37,8 +37,8 @@ def get_hash_bytes(to_hash):
     return hashlib.md5(to_hash).hexdigest()
 
 
-def already_exists(urlhash):
+def already_exists(filehash):
     cache_enabled = app.config.get('CSV_CACHE_ENABLED')
     if not cache_enabled:
         return False
-    return Path(get_db_info(urlhash)['db_path']).exists()
+    return Path(get_db_info(filehash)['db_path']).exists()

From ed84cec2958c5468aed9ef6218e81b107a23029c Mon Sep 17 00:00:00 2001
From: quaxsze
Date: Thu, 30 Jan 2020 18:12:36 +0100
Subject: [PATCH 3/7] removed todo

---
 csvapi/parseview.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/csvapi/parseview.py b/csvapi/parseview.py
index 59f7bfe..739f19d 100644
--- a/csvapi/parseview.py
+++ b/csvapi/parseview.py
@@ -25,7 +25,6 @@ async def do_parse(url, encoding, storage, logger, sniff_limit, max_file_size):
         chunk_size = 1024
         start_dl = time.time()
         try:
-            # TODO: Is it possible to know any change in the hash of a file without downloading it to check it?
             async with aiohttp.ClientSession() as session:
                 async with session.get(url) as resp:
                     while True:

From b8127956b2ba864e16448f57d0586912e9633dd0 Mon Sep 17 00:00:00 2001
From: quaxsze
Date: Thu, 30 Jan 2020 18:48:26 +0100
Subject: [PATCH 4/7] change hash func in utils

---
 csvapi/parseview.py | 52 +++++++++++++++++++--------------------------
 csvapi/utils.py     |  5 +++--
 2 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/csvapi/parseview.py b/csvapi/parseview.py
index 59f7bfe..869ddea 100644
--- a/csvapi/parseview.py
+++ b/csvapi/parseview.py
@@ -18,14 +18,13 @@ class ParseView(MethodView):
 
     @staticmethod
-    async def do_parse(url, encoding, storage, logger, sniff_limit, max_file_size):
+    async def do_parse(url, urlhash, encoding, storage, logger, sniff_limit, max_file_size):
         logger.debug('* do_parse (%s)', url)
         tmp = tempfile.NamedTemporaryFile(delete=False)
         chunk_count = 0
         chunk_size = 1024
         start_dl = time.time()
         try:
-            # TODO: Is it possible to know any change in the hash of a file without downloading it to check it?
             async with aiohttp.ClientSession() as session:
                 async with session.get(url) as resp:
                     while True:
@@ -62,7 +61,7 @@ async def do_parse(url, encoding, storage, logger, sniff_limit, max_file_size):
             os.unlink(tmp.name)
 
     async def get(self):
-        start = time.time()
+        # start = time.time()
         app.logger.debug('* Starting ParseView.get')
         url = request.args.get('url')
         encoding = request.args.get('encoding')
@@ -70,35 +69,28 @@ async def get(self):
             raise APIError('Missing url query string variable.', status=400)
         if not validators.url(url):
             raise APIError('Malformed url parameter.', status=400)
-
-        storage = app.config['DB_ROOT_DIR']
-        filehash = await self.do_parse(url=url,
-                                       encoding=encoding,
-                                       storage=storage,
-                                       logger=app.logger,
-                                       sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
-                                       max_file_size=app.config.get('MAX_FILE_SIZE')
-                                       )
+        urlhash = get_hash(url)
 
-        # if not already_exists(urlhash):
-        #     try:
-        #         storage = app.config['DB_ROOT_DIR']
-        #         await self.do_parse(url=url,
-        #                             urlhash=urlhash,
-        #                             encoding=encoding,
-        #                             storage=storage,
-        #                             logger=app.logger,
-        #                             sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
-        #                             max_file_size=app.config.get('MAX_FILE_SIZE')
-        #                             )
-        #     except Exception as e:
-        #         raise APIError('Error parsing CSV: %s' % e)
-        #     else:
-        #         app.logger.info(f"{urlhash}.db already exists, skipping parse.")
+        if not already_exists(urlhash):
+            try:
+                storage = app.config['DB_ROOT_DIR']
+                filehash = await self.do_parse(
+                    url=url,
+                    urlhash=urlhash,
+                    encoding=encoding,
+                    storage=storage,
+                    logger=app.logger,
+                    sniff_limit=app.config.get('CSV_SNIFF_LIMIT'),
+                    max_file_size=app.config.get('MAX_FILE_SIZE')
+                )
+            except Exception as e:
+                raise APIError('Error parsing CSV: %s' % e)
+        else:
+            app.logger.info(f"{urlhash}.db already exists, skipping parse.")
         scheme = 'https' if app.config.get('FORCE_SSL') else request.scheme
-        end = time.time()
-        timer = end - start
-        print(f"--------------------------------> total execution time: {timer}<------------------------------------")
+        # end = time.time()
+        # timer = end - start
+        # print(f"--------------------------------> total execution time: {timer}<------------------------------------")
         return jsonify({
             'ok': True,
             'endpoint': f"{scheme}://{request.host}/api/{filehash}"
         })

diff --git a/csvapi/utils.py b/csvapi/utils.py
index 92b65dc..d696bd1 100644
--- a/csvapi/utils.py
+++ b/csvapi/utils.py
@@ -1,4 +1,5 @@
-import hashlib
+import os
+import xxhash
 
 from concurrent import futures
 from pathlib import Path
@@ -34,7 +35,7 @@ def get_hash(to_hash):
 
 
 def get_hash_bytes(to_hash):
-    return hashlib.md5(to_hash).hexdigest()
+    return xxhash.xxh64(to_hash).hexdigest()

From 0f1a969ca1f00940de02d7bc1dae29a6a1db0626 Mon Sep 17 00:00:00 2001
From: quaxsze
Date: Fri, 31 Jan 2020 10:24:11 +0100
Subject: [PATCH 5/7] added max age

---
 csvapi/cli.py       |  5 ++++-
 csvapi/parseview.py |  6 +++---
 csvapi/utils.py     | 25 ++++++++++++++++++++++++-
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/csvapi/cli.py b/csvapi/cli.py
index 6d5675c..674f160 100644
--- a/csvapi/cli.py
+++ b/csvapi/cli.py
@@ -29,6 +29,8 @@ def cli():
               help='Automatically reload if code change detected')
 @click.option('--cache/--no-cache', default=True,
               help='Do not parse CSV again if DB already exists')
+@click.option('--cache-max-age', default=7,
+              help='Cache expiration time in days')
 @click.option('-w', '--max-workers', default=3,
               help='Max number of ThreadPoolExecutor workers')
 @click.option('--ssl-cert', default=None,
               help='Path to SSL certificate')
 @click.option('--ssl-key', default=None,
               help='Path to SSL key')
 @cli.command()
-def serve(dbs, host, port, debug, reload, cache, max_workers, ssl_cert, ssl_key):
+def serve(dbs, host, port, debug, reload, cache, cache_max_age, max_workers, ssl_cert, ssl_key):
     ssl_context = None
     if ssl_cert and ssl_key:
         ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
@@ -44,6 +46,7 @@ def serve(dbs, host, port, debug, reload, cache, max_workers, ssl_cert, ssl_key)
     app.config.update({
         'DB_ROOT_DIR': dbs,
         'CSV_CACHE_ENABLED': cache,
+        'CACHE_MAX_AGE': cache_max_age
         'MAX_WORKERS': max_workers,
         'DEBUG': debug,
         # TODO this probably does not exist in Quart

diff --git a/csvapi/parseview.py b/csvapi/parseview.py
index 869ddea..5b49dbd 100644
--- a/csvapi/parseview.py
+++ b/csvapi/parseview.py
@@ -11,7 +11,7 @@
 from csvapi.errors import APIError
 from csvapi.parser import parse
-from csvapi.utils import already_exists, get_hash
+from csvapi.utils import already_exists, get_hash, max_age
 
 X = xxhash.xxh64()
 
@@ -70,10 +70,10 @@ async def get(self):
         if not validators.url(url):
             raise APIError('Malformed url parameter.', status=400)
         urlhash = get_hash(url)
+        storage = app.config['DB_ROOT_DIR']
 
-        if not already_exists(urlhash):
+        if not max_age(storage, urlhash):
             try:
-                storage = app.config['DB_ROOT_DIR']
                 filehash = await self.do_parse(
                     url=url,
                     urlhash=urlhash,

diff --git a/csvapi/utils.py b/csvapi/utils.py
index d696bd1..d951123 100644
--- a/csvapi/utils.py
+++ b/csvapi/utils.py
@@ -1,4 +1,6 @@
 import os
+import datetime
+
 import xxhash
 
 from concurrent import futures
 from pathlib import Path
@@ -12,7 +14,10 @@
 def get_db_info(urlhash, storage=None):
     # app.config not thread safe, sometimes we need to pass storage directly
     storage = storage or app.config['DB_ROOT_DIR']
-    dbpath = f"{storage}/{urlhash}.db"
+    for f in os.listdir(storage):
+        if f.startswith(urlhash):
+            dbpath = f"{storage}/{f}"
+            break
     return {
         'dsn': f"sqlite:///{dbpath}",
         'db_name': urlhash,
@@ -43,3 +48,21 @@ def already_exists(filehash):
     cache_enabled = app.config.get('CSV_CACHE_ENABLED')
     if not cache_enabled:
         return False
     return Path(get_db_info(filehash)['db_path']).exists()
+
+
+def max_age(storage, urlhash):
+    max_age = app.config['CACHE_MAX_AGE']
+    for f in os.listdir(storage):
+        if f.startswith(urlhash):
+            mod_time = os.stat(f).st_mtime
+            mod_timestamp = datetime.datetime.fromtimestamp(mod_time)
+            later_time = datetime.datetime.now()
+            file_age = later_time - mod_timestamp
+            if file_age.days > max_age:
+                return True
+            else:
+                break
+    return False
+
+
+

From 3797c8ff5524719cc661db32a4efb36b6f231621 Mon Sep 17 00:00:00 2001
From: quaxsze
Date: Tue, 4 Feb 2020 16:33:50 +0100
Subject: [PATCH 6/7] refactoring hash

---
 csvapi/cli.py            |   2 +-
 csvapi/parser.py         |  15 +++---
 csvapi/parseview.py      |  32 +++++-------
 csvapi/utils.py          | 107 +++++++++++++++++++++++++++++++--------
 csvapi/webservice.py     |   2 +
 requirements/install.pip |   1 +
 6 files changed, 111 insertions(+), 48 deletions(-)

diff --git a/csvapi/cli.py b/csvapi/cli.py
index 674f160..9ceec4a 100644
--- a/csvapi/cli.py
+++ b/csvapi/cli.py
@@ -46,7 +46,7 @@ def serve(dbs, host, port, debug, reload, cache, cache_max_age, max_workers, ssl
     app.config.update({
         'DB_ROOT_DIR': dbs,
         'CSV_CACHE_ENABLED': cache,
-        'CACHE_MAX_AGE': cache_max_age
+        'CACHE_MAX_AGE': cache_max_age,
         'MAX_WORKERS': max_workers,
         'DEBUG': debug,
         # TODO this probably does not exist in Quart

diff --git a/csvapi/parser.py b/csvapi/parser.py
index 467a85d..ca83a5d 100644
--- a/csvapi/parser.py
+++ b/csvapi/parser.py
@@ -1,10 +1,11 @@
 import os
+import uuid
 
 import agate
 import agatesql  # noqa
 import cchardet as chardet
 
-from csvapi.utils import get_db_info
+from csvapi.utils import get_db_info, add_entry_to_sys_db
 from csvapi.type_tester import agate_tester
 
 SNIFF_LIMIT = 4096
@@ -33,15 +34,17 @@ def from_excel(filepath):
     return agate.Table.from_xls(filepath, column_types=agate_tester())
 
 
-def to_sql(table, urlhash, storage):
-    db_info = get_db_info(urlhash, storage=storage)
-    table.to_sql(db_info['dsn'], db_info['db_name'], overwrite=True)
+def to_sql(table, urlhash, filehash, storage):
+    db_name = str(uuid.uuid4())
+    db_dsn = f"sqlite:///{storage}/{db_name}.db"
+    table.to_sql(db_dsn, db_name, overwrite=True)
+    add_entry_to_sys_db(db_name, urlhash, filehash)
 
 
-def parse(filepath, urlhash, storage, encoding=None, sniff_limit=SNIFF_LIMIT):
+def parse(filepath, urlhash, filehash, storage, encoding=None, sniff_limit=SNIFF_LIMIT):
     if is_binary(filepath):
         table = from_excel(filepath)
     else:
         encoding = detect_encoding(filepath) if not encoding else encoding
         table = from_csv(filepath, encoding=encoding, sniff_limit=sniff_limit)
-    return to_sql(table, urlhash, storage)
+    return to_sql(table, urlhash, filehash, storage)
\ No newline at end of file

diff --git a/csvapi/parseview.py b/csvapi/parseview.py
index 5b49dbd..64311a4 100644
--- a/csvapi/parseview.py
+++ b/csvapi/parseview.py
@@ -11,7 +11,7 @@
 from csvapi.errors import APIError
 from csvapi.parser import parse
-from csvapi.utils import already_exists, get_hash, max_age
+from csvapi.utils import is_hash_relevant, get_hash, age_valid
 
 X = xxhash.xxh64()
 
@@ -23,7 +23,6 @@ async def do_parse(url, urlhash, encoding, storage, logger, sniff_limit, max_fil
         tmp = tempfile.NamedTemporaryFile(delete=False)
         chunk_count = 0
         chunk_size = 1024
-        start_dl = time.time()
         try:
             async with aiohttp.ClientSession() as session:
                 async with session.get(url) as resp:
@@ -39,29 +38,23 @@ async def do_parse(url, urlhash, encoding, storage, logger, sniff_limit, max_fil
                         chunk_count += 1
             tmp.close()
             filehash = X.hexdigest()
-            print(filehash)
             logger.debug('* Downloaded %s', filehash)
-            end_dl = time.time()
-            print(f"--------------------------------> download time: {end_dl - start_dl}<------------------------------------")
-            if not already_exists(filehash):
+            if not is_hash_relevant(urlhash, filehash):
+                print("HASH IS NOT RELEVANT")
                 try:
-                    start_parse = time.time()
                     logger.debug('* Parsing %s...', filehash)
-                    parse(tmp.name, filehash, storage, encoding=encoding, sniff_limit=sniff_limit)
+                    parse(tmp.name, urlhash, filehash, storage, encoding=encoding, sniff_limit=sniff_limit)
                     logger.debug('* Parsed %s', filehash)
-                    end_parse = time.time()
-                    print(f"--------------------------------> parse time: {end_parse - start_parse}<------------------------------------")
                 except Exception as e:
                     raise APIError('Error parsing CSV: %s' % e)
             else:
-                logger.info(f"{filehash}.db already exists, skipping parse.")
-            return filehash
+                print("HASH IS RELEVANT")
+                logger.info(f"File hash for {urlhash} is relevant, skipping parse.")
         finally:
             logger.debug('Removing tmp file: %s', tmp.name)
             os.unlink(tmp.name)
 
     async def get(self):
-        # start = time.time()
         app.logger.debug('* Starting ParseView.get')
         url = request.args.get('url')
         encoding = request.args.get('encoding')
@@ -72,9 +65,10 @@ async def get(self):
         urlhash = get_hash(url)
         storage = app.config['DB_ROOT_DIR']
 
-        if not max_age(storage, urlhash):
+        if not age_valid(storage, urlhash):
+            print("AGE IS NOT OK")
             try:
-                filehash = await self.do_parse(
+                await self.do_parse(
                     url=url,
                     urlhash=urlhash,
                     encoding=encoding,
@@ -86,12 +80,10 @@ async def get(self):
             except Exception as e:
                 raise APIError('Error parsing CSV: %s' % e)
         else:
-            app.logger.info(f"{urlhash}.db already exists, skipping parse.")
+            print("AGE IS OK")
+            app.logger.info(f"Db for {urlhash} is young enough, serving as is.")
         scheme = 'https' if app.config.get('FORCE_SSL') else request.scheme
         return jsonify({
             'ok': True,
-            'endpoint': f"{scheme}://{request.host}/api/{filehash}"
+            'endpoint': f"{scheme}://{request.host}/api/{urlhash}"
         })

diff --git a/csvapi/utils.py b/csvapi/utils.py
index d951123..e4f63ca 100644
--- a/csvapi/utils.py
+++ b/csvapi/utils.py
@@ -1,6 +1,7 @@
 import os
 import datetime
+import sqlite3
 
 import xxhash
 
 from concurrent import futures
@@ -11,18 +12,69 @@
 executor = None
 
 
-def get_db_info(urlhash, storage=None):
-    # app.config not thread safe, sometimes we need to pass storage directly
+def create_sys_db(app, storage=None):
+    # We do not use the get_sys_db_info here because the call is made outside of the app context.
     storage = storage or app.config['DB_ROOT_DIR']
-    for f in os.listdir(storage):
-        if f.startswith(urlhash):
-            dbpath = f"{storage}/{f}"
-            break
+    dbpath = f"{storage}/sys.db"
+
+    conn = sqlite3.connect(dbpath)
+    with conn:
+        conn.execute("CREATE TABLE IF NOT EXISTS csvapi_sys (id integer primary key, db_uuid text, urlhash text, filehash text, creation_time date)")
+    conn.close()
+
+
+def get_sys_db_info():
+    storage = app.config['DB_ROOT_DIR']
+    dbpath = f"{storage}/sys.db"
     return {
         'dsn': f"sqlite:///{dbpath}",
-        'db_name': urlhash,
+        'db_name': "sys.db",
+        'table_name': "csvapi_sys",
+        'db_path': dbpath,
+    }
+
+
+def add_entry_to_sys_db(uuid, urlhash, filehash):
+    now = datetime.datetime.now()
+    now_str = now.strftime('%Y-%m-%d')
+
+    sys_db = get_sys_db_info()
+    conn = sqlite3.connect(sys_db['db_path'])
+    with conn:
+        conn.execute("INSERT INTO csvapi_sys (db_uuid, urlhash, filehash, creation_time) values (?, ?, ?, ?)", (uuid, urlhash, filehash, now_str))
+    conn.close()
+
+
+
+def get_db_info(urlhash, storage=None):
+    storage = storage or app.config['DB_ROOT_DIR']
+
+    sys_db = get_sys_db_info()
+    conn = sqlite3.connect(sys_db['db_path'])
+    c = conn.cursor()
+    t = (urlhash,)
+    c.execute('SELECT * FROM csvapi_sys WHERE urlhash=?', t)
+    res = c.fetchone()
+    if not res:
+        return None
+
+    dbuuid = res[1]
+    urlhash = res[2]
+    filehash = res[3]
+    creadate = res[4]
+    dbpath = f"{storage}/{dbuuid}"
+    dbname = dbuuid
+
+    conn.close()
+    return {
+        'db_uuid': dbuuid,
+        'urlhash': urlhash,
+        'filehash': filehash,
+        'creation_date': creadate,
         'table_name': urlhash,
         'db_path': dbpath,
+        'db_name': dbname,
+        'dsn': f"sqlite:///{dbpath}"
+    }
@@ -43,26 +95,39 @@ def get_hash_bytes(to_hash):
     return xxhash.xxh64(to_hash).hexdigest()
 
 
-def already_exists(filehash):
+def already_exists(urlhash):
+    pass
+
+
+def is_hash_relevant(urlhash, filehash):
     cache_enabled = app.config.get('CSV_CACHE_ENABLED')
     if not cache_enabled:
         return False
-    return Path(get_db_info(filehash)['db_path']).exists()
+    db = get_db_info(urlhash)
+    if db is None:
+        return False
+
+    if db['filehash'] == filehash:
+        return True
 
-def max_age(storage, urlhash):
-    max_age = app.config['CACHE_MAX_AGE']
-    for f in os.listdir(storage):
-        if f.startswith(urlhash):
-            mod_time = os.stat(f).st_mtime
-            mod_timestamp = datetime.datetime.fromtimestamp(mod_time)
-            later_time = datetime.datetime.now()
-            file_age = later_time - mod_timestamp
-            if file_age.days > max_age:
-                return True
-            else:
-                break
-    return False
+    return False
+
+
+def age_valid(storage, urlhash):
+    max_age = app.config['CACHE_MAX_AGE']
+
+    db = get_db_info(urlhash)
+    if db is None:
+        return False
+
+    date_time_obj = datetime.datetime.strptime(db['creation_date'], '%Y-%m-%d')
+    later_time = datetime.datetime.now()
+    file_age = later_time - date_time_obj
+    if file_age.days > max_age:
+        return False
+
+    return True
+
+

diff --git a/csvapi/webservice.py b/csvapi/webservice.py
index 510c0ad..9533a5b 100644
--- a/csvapi/webservice.py
+++ b/csvapi/webservice.py
@@ -11,6 +11,7 @@
 from csvapi.uploadview import UploadView
 from csvapi.parseview import ParseView
 from csvapi.security import filter_referrers
+from csvapi.utils import create_sys_db
 
 app = Quart(__name__)
 app = cors(app, allow_origin='*')
@@ -29,6 +30,7 @@
     from raven import Client
     app.extensions['sentry'] = Client(app.config['SENTRY_DSN'])
 
+create_sys_db(app)
 
 def handle_and_print_error():
     sentry_id = None

diff --git a/requirements/install.pip b/requirements/install.pip
index e2ebf03..72c891a 100644
--- a/requirements/install.pip
+++ b/requirements/install.pip
@@ -10,3 +10,4 @@ quart-cors==0.2.0
 raven==6.10.0
 cchardet==2.1.4
 python-stdnum==1.11
+xxhash==1.4.3

From 97f3e37e63683abbbbbec9bcdde41b67440bd181 Mon Sep 17 00:00:00 2001
From: quaxsze
Date: Tue, 4 Feb 2020 17:16:44 +0100
Subject: [PATCH 7/7] seek by urlhash or filehash

---
 csvapi/exportview.py |  2 +-
 csvapi/parser.py     |  4 ++--
 csvapi/tableview.py  |  9 +++++----
 csvapi/uploadview.py |  2 ++
 csvapi/utils.py      | 36 ++++++++++++++++++++++---------
 5 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/csvapi/exportview.py b/csvapi/exportview.py
index 8fbb7be..9488f64 100644
--- a/csvapi/exportview.py
+++ b/csvapi/exportview.py
@@ -15,7 +15,7 @@ class ExportView(TableView):
 
     async def get(self, urlhash):
         "This will inherit sorting and filtering from TableView"
-        db_info = get_db_info(urlhash)
+        db_info = get_db_info(urlhash=urlhash)
         p = Path(db_info['db_path'])
         if not p.exists():
             raise APIError('Database has probably been removed.', status=404)

diff --git a/csvapi/parser.py b/csvapi/parser.py
index ca83a5d..28df041 100644
--- a/csvapi/parser.py
+++ b/csvapi/parser.py
@@ -5,7 +5,7 @@
 import agatesql  # noqa
 import cchardet as chardet
 
-from csvapi.utils import get_db_info, add_entry_to_sys_db
+from csvapi.utils import add_entry_to_sys_db
 from csvapi.type_tester import agate_tester
 
 SNIFF_LIMIT = 4096
@@ -37,7 +37,7 @@ def from_excel(filepath):
 def to_sql(table, urlhash, filehash, storage):
     db_name = str(uuid.uuid4())
     db_dsn = f"sqlite:///{storage}/{db_name}.db"
-    table.to_sql(db_dsn, db_name, overwrite=True)
+    table.to_sql(db_dsn, urlhash, overwrite=True)
     add_entry_to_sys_db(db_name, urlhash, filehash)

diff --git a/csvapi/tableview.py b/csvapi/tableview.py
index cf29e54..1cd7437 100644
--- a/csvapi/tableview.py
+++ b/csvapi/tableview.py
@@ -146,10 +146,11 @@ async def data(self, db_info, export=False):
         return res
 
     async def get(self, urlhash):
-        db_info = get_db_info(urlhash)
-        p = Path(db_info['db_path'])
-        if not p.exists():
-            raise APIError('Database has probably been removed.', status=404)
+        db_info = get_db_info(urlhash=urlhash)
+        if db_info is not None:
+            p = Path(db_info['db_path'])
+            if not p.exists():
+                raise APIError('Database has probably been removed.', status=404)
 
         start = time.time()
         try:

diff --git a/csvapi/uploadview.py b/csvapi/uploadview.py
index 85bce83..7f3005f 100644
--- a/csvapi/uploadview.py
+++ b/csvapi/uploadview.py
@@ -26,6 +26,8 @@ async def post(self):
             _file.save(_tmpfile)
             _tmpfile.close()
             parse(_tmpfile.name, content_hash, storage, sniff_limit=sniff_limit)
+            # No urlhash here; we need to change the parse code to allow urlhash to be None.
+            # But how will get_db_info in tableview work with such an entry in sys DB?
         finally:
             os.unlink(_tmpfile.name)

diff --git a/csvapi/utils.py b/csvapi/utils.py
index e4f63ca..accbe96 100644
--- a/csvapi/utils.py
+++ b/csvapi/utils.py
@@ -45,14 +45,23 @@ def add_entry_to_sys_db(uuid, urlhash, filehash):
 
 
-def get_db_info(urlhash, storage=None):
+def get_db_info(urlhash=None, filehash=None, storage=None):
     storage = storage or app.config['DB_ROOT_DIR']
 
     sys_db = get_sys_db_info()
     conn = sqlite3.connect(sys_db['db_path'])
     c = conn.cursor()
-    t = (urlhash,)
-    c.execute('SELECT * FROM csvapi_sys WHERE urlhash=?', t)
+
+    # The function allows seeking by either urlhash or filehash because of the uploadview.
+    # Do we want to keep things this way?
+
+    if urlhash is not None:
+        c.execute('SELECT * FROM csvapi_sys WHERE urlhash=?', (urlhash,))
+    elif filehash is not None:
+        c.execute('SELECT * FROM csvapi_sys WHERE filehash=?', (filehash,))
+    else:
+        raise RuntimeError('get_db_info needs at least one non-None argument')
+
     res = c.fetchone()
     if not res:
         return None
@@ -61,7 +70,7 @@ def get_db_info(urlhash, storage=None):
     urlhash = res[2]
     filehash = res[3]
     creadate = res[4]
-    dbpath = f"{storage}/{dbuuid}"
+    dbpath = f"{storage}/{dbuuid}.db"
     dbname = dbuuid
 
     conn.close()
@@ -95,8 +104,16 @@ def get_hash_bytes(to_hash):
     return xxhash.xxh64(to_hash).hexdigest()
 
 
-def already_exists(urlhash):
-    pass
+def already_exists(filehash):
+    cache_enabled = app.config.get('CSV_CACHE_ENABLED')
+    if not cache_enabled:
+        return False
+
+    db = get_db_info(filehash=filehash)
+    if db is None:
+        return False
+
+    return True
 
 
 def is_hash_relevant(urlhash, filehash):
@@ -104,10 +121,13 @@ def is_hash_relevant(urlhash, filehash):
     if not cache_enabled:
         return False
 
-    db = get_db_info(urlhash)
+    db = get_db_info(urlhash=urlhash)
     if db is None:
         return False
 
+    # The question here is whether to seek by urlhash or directly by filehash.
+    # Seeking by filehash would save the hash comparison, but are we sure we are getting the right entry for the urlhash we wanted?
+    # The answer is yes if there can't be more than one entry by urlhash.
     if db['filehash'] == filehash:
         return True
 
@@ -117,7 +137,7 @@ def age_valid(storage, urlhash):
     max_age = app.config['CACHE_MAX_AGE']
 
-    db = get_db_info(urlhash)
+    db = get_db_info(urlhash=urlhash)
     if db is None:
         return False
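
The flow these patches converge on: sys.db keeps one row per parsed dataset (db_uuid, urlhash, filehash, creation_time); age_valid() decides from creation_time whether the cached SQLite file can be served without re-downloading, and is_hash_relevant() compares the xxhash64 of the freshly downloaded bytes with the stored filehash to decide whether a re-parse is needed. The sketch below only illustrates that lookup order against the csvapi_sys schema created in patch 6; the helper names (lookup_entry, age_is_valid, content_changed) and the standalone connection handling are illustrative assumptions, not code from the patches.

import datetime
import sqlite3

import xxhash

CACHE_MAX_AGE_DAYS = 7  # mirrors the --cache-max-age default added in patch 5


def lookup_entry(sys_db_path, urlhash):
    """Fetch the csvapi_sys row for a URL hash, or None if it was never parsed."""
    conn = sqlite3.connect(sys_db_path)
    try:
        return conn.execute(
            'SELECT db_uuid, filehash, creation_time FROM csvapi_sys WHERE urlhash=?',
            (urlhash,)
        ).fetchone()
    finally:
        conn.close()


def age_is_valid(entry):
    """Serve the cached db as is while the entry is younger than the max age."""
    created = datetime.datetime.strptime(entry[2], '%Y-%m-%d')
    return (datetime.datetime.now() - created).days <= CACHE_MAX_AGE_DAYS


def content_changed(entry, downloaded_bytes):
    """Re-parse only when the xxhash64 of the new download differs from the stored filehash."""
    return xxhash.xxh64(downloaded_bytes).hexdigest() != entry[1]

Read this way, the open question raised in patch 7 — seek by urlhash or by filehash — comes down to whether a urlhash can ever map to more than one row in csvapi_sys.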