From 3ce576d94d16a0b02e77e341f3d32dc220785513 Mon Sep 17 00:00:00 2001 From: timini Date: Wed, 20 Apr 2016 11:50:46 +0100 Subject: [PATCH 01/12] boilerplate for topics API --- api/api/comments/__init__.py | 0 api/api/comments/admin.py | 3 +++ api/api/comments/apps.py | 5 +++++ api/api/comments/migrations/__init__.py | 0 api/api/comments/models.py | 3 +++ api/api/comments/tests.py | 3 +++ api/api/comments/views.py | 3 +++ api/api/topics/__init__.py | 0 api/api/topics/admin.py | 3 +++ api/api/topics/apps.py | 5 +++++ api/api/topics/migrations/__init__.py | 0 api/api/topics/models.py | 3 +++ api/api/topics/tests.py | 3 +++ api/api/topics/views.py | 3 +++ 14 files changed, 34 insertions(+) create mode 100644 api/api/comments/__init__.py create mode 100644 api/api/comments/admin.py create mode 100644 api/api/comments/apps.py create mode 100644 api/api/comments/migrations/__init__.py create mode 100644 api/api/comments/models.py create mode 100644 api/api/comments/tests.py create mode 100644 api/api/comments/views.py create mode 100644 api/api/topics/__init__.py create mode 100644 api/api/topics/admin.py create mode 100644 api/api/topics/apps.py create mode 100644 api/api/topics/migrations/__init__.py create mode 100644 api/api/topics/models.py create mode 100644 api/api/topics/tests.py create mode 100644 api/api/topics/views.py diff --git a/api/api/comments/__init__.py b/api/api/comments/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/api/comments/admin.py b/api/api/comments/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/api/api/comments/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. diff --git a/api/api/comments/apps.py b/api/api/comments/apps.py new file mode 100644 index 0000000..ff01b77 --- /dev/null +++ b/api/api/comments/apps.py @@ -0,0 +1,5 @@ +from django.apps import AppConfig + + +class CommentsConfig(AppConfig): + name = 'comments' diff --git a/api/api/comments/migrations/__init__.py b/api/api/comments/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/api/comments/models.py b/api/api/comments/models.py new file mode 100644 index 0000000..71a8362 --- /dev/null +++ b/api/api/comments/models.py @@ -0,0 +1,3 @@ +from django.db import models + +# Create your models here. diff --git a/api/api/comments/tests.py b/api/api/comments/tests.py new file mode 100644 index 0000000..7ce503c --- /dev/null +++ b/api/api/comments/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/api/api/comments/views.py b/api/api/comments/views.py new file mode 100644 index 0000000..91ea44a --- /dev/null +++ b/api/api/comments/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. diff --git a/api/api/topics/__init__.py b/api/api/topics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/api/topics/admin.py b/api/api/topics/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/api/api/topics/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. diff --git a/api/api/topics/apps.py b/api/api/topics/apps.py new file mode 100644 index 0000000..07e9bce --- /dev/null +++ b/api/api/topics/apps.py @@ -0,0 +1,5 @@ +from django.apps import AppConfig + + +class TopicsConfig(AppConfig): + name = 'topics' diff --git a/api/api/topics/migrations/__init__.py b/api/api/topics/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/api/topics/models.py b/api/api/topics/models.py new file mode 100644 index 0000000..71a8362 --- /dev/null +++ b/api/api/topics/models.py @@ -0,0 +1,3 @@ +from django.db import models + +# Create your models here. diff --git a/api/api/topics/tests.py b/api/api/topics/tests.py new file mode 100644 index 0000000..7ce503c --- /dev/null +++ b/api/api/topics/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/api/api/topics/views.py b/api/api/topics/views.py new file mode 100644 index 0000000..91ea44a --- /dev/null +++ b/api/api/topics/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. From 69703ef095442501973cb85f79ad888f17e1f580 Mon Sep 17 00:00:00 2001 From: timini Date: Wed, 20 Apr 2016 13:16:51 +0100 Subject: [PATCH 02/12] add postgres adapter and setings --- api/api/main/settings.py | 21 ++++++++++++++++----- api/requirements.txt | 5 +++++ docker-compose.yml | 11 ++++------- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/api/api/main/settings.py b/api/api/main/settings.py index 930e502..c3d0e1b 100644 --- a/api/api/main/settings.py +++ b/api/api/main/settings.py @@ -82,12 +82,23 @@ # Database # https://docs.djangoproject.com/en/1.9/ref/settings/#databases -DATABASES = { - 'default': { - 'ENGINE': 'django.db.backends.sqlite3', - 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), +if os.environ.get('CONTAINERIZED'): + DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.postgresql_psycopg2', + 'NAME': 'postgres', + 'USER': 'postgres', + 'HOST': 'db', + 'PORT': 5432, + } + } +else: + DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), + } } -} # Password validation diff --git a/api/requirements.txt b/api/requirements.txt index f04a6af..419ca28 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -13,11 +13,16 @@ inflection==0.3.1 ipdb==0.9.0 ipython==4.1.2 ipython-genutils==0.1.0 +nltk==3.2 +numpy==1.11.0 oauthlib==1.0.3 path.py==8.1.2 pexpect==4.0.1 pickleshare==0.6 +psycopg2==2.6.1 ptyprocess==0.5.1 +requests==2.9.1 +sets==0.2.0 simplegeneric==0.8.1 six==1.10.0 traitlets==4.2.1 diff --git a/docker-compose.yml b/docker-compose.yml index 8cedb7e..d6ce531 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,11 +10,8 @@ api: links: - db - search -# depends_on: -# - db -# - search -client: - build: ./client - command: ember server -# depends_on: +# client: +# build: ./client +# command: ember server +# links: # - api From c3f70cc3b5d2a7069c683bfbc47e71b9d49863ae Mon Sep 17 00:00:00 2001 From: timini Date: Wed, 20 Apr 2016 15:23:11 +0100 Subject: [PATCH 03/12] node script to start docker-compose in development --- api/api/main/settings.py | 2 +- run.js | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 run.js diff --git a/api/api/main/settings.py b/api/api/main/settings.py index c3d0e1b..a00d24f 100644 --- a/api/api/main/settings.py +++ b/api/api/main/settings.py @@ -84,7 +84,7 @@ if os.environ.get('CONTAINERIZED'): DATABASES = { - 'default': { + 'default': { 'ENGINE': 'django.db.backends.postgresql_psycopg2', 'NAME': 'postgres', 'USER': 'postgres', diff --git a/run.js b/run.js new file mode 100644 index 0000000..9239263 --- /dev/null +++ b/run.js @@ -0,0 +1,24 @@ +'use strict' +const spawn = require('child_process').spawn; +const exec = require('child_process').exec; + + +const events = spawn('docker', ['events', '-f', 'container=hansard_api']) +const compose = spawn('docker-compose', ['up']) + +compose.stdout.on('data', (data) => { + console.log(data.toString()); +}) + +compose.stderr.on('data', (data) => { + console.log(data.toString()); +}) + +events.stdout.on('data', (data) => { + if (data.toString().indexOf('attach') !== -1){ + exec('docker exec hansard_api_1 python api/manage.py migrate', (error, stdout, stderr) =>{ + console.log(stdout) + console.log(stderr) + }) + } +}); From 128d8a0f961d82eef336536f25bb14d78e312036 Mon Sep 17 00:00:00 2001 From: timini Date: Sun, 7 Aug 2016 02:46:17 +0100 Subject: [PATCH 04/12] add iepy, and Makefile --- Makefile | 55 ++++++ docker-compose.yml => docker-compose-api.yml | 5 +- docker-compose-iepy.yml | 8 + iepy/Dockerfile | 21 +++ iepy/docker-compose.yml | 10 + iepy/hansard/__init__.py | 1 + iepy/hansard/bin/csv_to_iepy.py | 28 +++ iepy/hansard/bin/gazettes_loader.py | 76 ++++++++ iepy/hansard/bin/iepy_rules_runner.py | 59 ++++++ iepy/hansard/bin/iepy_runner.py | 181 +++++++++++++++++++ iepy/hansard/bin/manage.py | 12 ++ iepy/hansard/bin/preprocess.py | 88 +++++++++ iepy/hansard/bin/rules_verifier.py | 149 +++++++++++++++ iepy/hansard/extractor_config.json | 20 ++ iepy/hansard/rules.py | 2 + iepy/hansard/settings.py | 35 ++++ run.js | 24 --- 17 files changed, 747 insertions(+), 27 deletions(-) create mode 100644 Makefile rename docker-compose.yml => docker-compose-api.yml (83%) create mode 100644 docker-compose-iepy.yml create mode 100644 iepy/Dockerfile create mode 100644 iepy/docker-compose.yml create mode 100644 iepy/hansard/__init__.py create mode 100644 iepy/hansard/bin/csv_to_iepy.py create mode 100644 iepy/hansard/bin/gazettes_loader.py create mode 100644 iepy/hansard/bin/iepy_rules_runner.py create mode 100644 iepy/hansard/bin/iepy_runner.py create mode 100644 iepy/hansard/bin/manage.py create mode 100644 iepy/hansard/bin/preprocess.py create mode 100644 iepy/hansard/bin/rules_verifier.py create mode 100644 iepy/hansard/extractor_config.json create mode 100644 iepy/hansard/rules.py create mode 100644 iepy/hansard/settings.py delete mode 100644 run.js diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..18ec9e7 --- /dev/null +++ b/Makefile @@ -0,0 +1,55 @@ +all: iepy api + +stop: docker-stop-api docker-stop-iepy + +# api ------------------ + +docker-build-api: + docker-compose -f docker-compose-api.yml build + +docker-up-api: docker-stop-api docker-build-api + docker-compose -f docker-compose-api.yml up -d + +docker-stop-api: + docker-compose -f docker-compose-api.yml stop + +docker-rm-api: docker-stop-api + docker-compose -f docker-compose-api.yml rm -f + +migrate-api: docker-up-api + docker exec hansard_api_1 python api/manage.py migrate + +createsuperuser-api: migrate-api + docker exec hansard_api_1 bash -c "echo \"from users.models import User; User.objects.create_superuser('admin', 'tim@rewire.it', 'sn0wb1rd')\" | python api/manage.py shell" + +init-data: migrate-api + docker exec -ti hansard_api_1 python api/manage.py init_data + +test-api: migrate-api + docker exec -ti hansard_api_1 bash -c "cd api && python manage.py test" + +api: createsuperuser-api + echo "api running on localhost:3000" + +#-------------------- + +docker-build-iepy: + docker-compose -f docker-compose-iepy.yml build + +docker-up-iepy: docker-stop-iepy docker-build-iepy + docker-compose -f docker-compose-iepy.yml up -d + +docker-stop-iepy: + docker-compose -f docker-compose-iepy.yml stop + +docker-rm-iepy: docker-stop-iepy + docker-compose -f docker-compose-iepy.yml rm -f + +migrate-iepy: docker-up-iepy + docker exec hansard_iepy_1 python /hansard/bin/manage.py migrate + +createsuperuser-iepy: migrate-iepy + docker exec hansard_iepy_1 bash -c "echo \"from django.contrib.auth.models import User; User.objects.create_superuser('admin', 'tim@rewire.it', 'sn0wb1rd')\" | python /hansard/bin/manage.py shell" + +iepy: createsuperuser-iepy + echo "iepy running on localhost:8001" diff --git a/docker-compose.yml b/docker-compose-api.yml similarity index 83% rename from docker-compose.yml rename to docker-compose-api.yml index d6ce531..0d411e9 100644 --- a/docker-compose.yml +++ b/docker-compose-api.yml @@ -1,15 +1,14 @@ db: image: postgres -search: - image: elasticsearch api: build: ./api command: python api/manage.py runserver 0.0.0.0:8000 ports: - "8000:8000" + volumes: + - ./api:/app links: - db - - search # client: # build: ./client # command: ember server diff --git a/docker-compose-iepy.yml b/docker-compose-iepy.yml new file mode 100644 index 0000000..6c24ebc --- /dev/null +++ b/docker-compose-iepy.yml @@ -0,0 +1,8 @@ +dbiepy: + image: postgres +iepy: + build: ./iepy + ports: + - "8001:8001" + links: + - dbiepy diff --git a/iepy/Dockerfile b/iepy/Dockerfile new file mode 100644 index 0000000..1263dce --- /dev/null +++ b/iepy/Dockerfile @@ -0,0 +1,21 @@ +from python:3.4 + +RUN pip install numpy + +RUN pip install iepy + +RUN apt-get update && apt-get install -y openjdk-7-jre + +ENV JAVAHOME=/usr/bin/java + +RUN mkdir /root/.config + +RUN iepy --download-third-party-data + +COPY ./hansard /hansard + +RUN pip install psycopg2 + +EXPOSE 8001 + +CMD python /hansard/bin/manage.py runserver 0.0.0.0:8001 diff --git a/iepy/docker-compose.yml b/iepy/docker-compose.yml new file mode 100644 index 0000000..5f9a4f1 --- /dev/null +++ b/iepy/docker-compose.yml @@ -0,0 +1,10 @@ +db: + image: postgres +iepy: + build: . + ports: + - "8000:8000" + volumes: + - ./hansard:/hansard + links: + - db diff --git a/iepy/hansard/__init__.py b/iepy/hansard/__init__.py new file mode 100644 index 0000000..5c1a7f9 --- /dev/null +++ b/iepy/hansard/__init__.py @@ -0,0 +1 @@ +from . import rules \ No newline at end of file diff --git a/iepy/hansard/bin/csv_to_iepy.py b/iepy/hansard/bin/csv_to_iepy.py new file mode 100644 index 0000000..002e5d1 --- /dev/null +++ b/iepy/hansard/bin/csv_to_iepy.py @@ -0,0 +1,28 @@ +""" +IEPY database loader from csv file + +Usage: + csv_to_iepy.py + csv_to_iepy.py -h | --help + +The argument can be a .csv file or a .csv.gz file containing the +corpus in two columns: 'freebase_mid' and 'description'. + +Options: + -h --help Show this screen + --version Version number +""" + +import logging + +from docopt import docopt + +import iepy +iepy.setup(__file__) +from iepy.utils import csv_to_iepy + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(message)s') + opts = docopt(__doc__, version=iepy.__version__) + filepath = opts[""] + csv_to_iepy(filepath) diff --git a/iepy/hansard/bin/gazettes_loader.py b/iepy/hansard/bin/gazettes_loader.py new file mode 100644 index 0000000..26322d1 --- /dev/null +++ b/iepy/hansard/bin/gazettes_loader.py @@ -0,0 +1,76 @@ +""" +IEPY gazettes loader + +Usage: + gazettes_loader.py + + +The argument can be a .csv file or a .csv.gz file containing the +gazettes in two columns: 'literal' and 'class'. + + +Options: + -h --help Show this screen +""" + +import sys +import csv +import gzip +import logging +from operator import itemgetter + +from django.db import IntegrityError +from docopt import docopt + +import iepy +iepy.setup(__file__) +from iepy.data.models import EntityKind, GazetteItem + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +def add_gazettes_from_csv(filepath): + if filepath.endswith(".gz"): + fin = gzip.open(filepath, "rt") + else: + fin = open(filepath, "rt") + reader = csv.DictReader(fin) + + expected_fnames = ['literal', 'class'] + if not set(reader.fieldnames).issuperset(expected_fnames): + msg = "Couldn't find the expected field names on the provided csv: {}" + sys.exit(msg.format(expected_fnames)) + + _create_gazette_entries( + itemgetter(*expected_fnames)(line) for line in reader + ) + + +def _create_gazette_entries(entries_list): + kind_cache = {} + created = 0 + for literal, kind_name in entries_list: + literal = literal.strip() + kind_name = kind_name.strip() + kind = kind_cache.get(kind_name) + if kind is None: + kind, _ = EntityKind.objects.get_or_create(name=kind_name) + kind_cache[kind_name] = kind + gazette = GazetteItem(text=literal, kind=kind) + + try: + gazette.save() + except IntegrityError as error: + logging.warn( + "Gazette '{}' of class '{}' not loaded, literal already existed".format( + literal, kind_name)) + print(error) + finally: + created += 1 + print('Created {} new gazette items'.format(created)) + + +if __name__ == "__main__": + opts = docopt(__doc__, version=iepy.__version__) + fname = opts[""] + add_gazettes_from_csv(fname) diff --git a/iepy/hansard/bin/iepy_rules_runner.py b/iepy/hansard/bin/iepy_rules_runner.py new file mode 100644 index 0000000..5016ae4 --- /dev/null +++ b/iepy/hansard/bin/iepy_rules_runner.py @@ -0,0 +1,59 @@ +""" +Run IEPY rule-based extractor + +Usage: + iepy_rules_runner.py + iepy_rules_runner.py -h | --help | --version + +Picks from rules.py the relation to work with, and the rules definitions and +proceeds with the extraction. + +Options: + -h --help Show this screen + --version Version number +""" +import sys +import logging + +from django.core.exceptions import ObjectDoesNotExist + +import iepy +iepy.setup(__file__) + +from iepy.extraction.rules import load_rules +from iepy.extraction.rules_core import RuleBasedCore +from iepy.data import models, output +from iepy.data.db import CandidateEvidenceManager + + +def run_from_command_line(): + logging.basicConfig(level=logging.INFO, format='%(message)s') + + try: + relation_name = iepy.instance.rules.RELATION + except AttributeError: + logging.error("RELATION not defined in rules file") + sys.exit(1) + + try: + relation = models.Relation.objects.get(name=relation_name) + except ObjectDoesNotExist: + logging.error("Relation {!r} not found".format(relation_name)) + sys.exit(1) + + # Load rules + rules = load_rules() + + # Load evidences + evidences = CandidateEvidenceManager.candidates_for_relation(relation) + + # Run the pipeline + iextractor = RuleBasedCore(relation, rules) + iextractor.start() + iextractor.process() + predictions = iextractor.predict(evidences) + output.dump_output_loop(predictions) + + +if __name__ == u'__main__': + run_from_command_line() diff --git a/iepy/hansard/bin/iepy_runner.py b/iepy/hansard/bin/iepy_runner.py new file mode 100644 index 0000000..6f2ac1e --- /dev/null +++ b/iepy/hansard/bin/iepy_runner.py @@ -0,0 +1,181 @@ +""" +Run IEPY active-learning extractor + +Usage: + iepy_runner.py [options] + iepy_runner.py [options] --db-store + iepy_runner.py -h | --help | --version + +Options: + --store-extractor= Stores the trained classifier + --trained-extractor= Load an already trained extractor + --db-store Stores the predictions on the database + --no-questions Won't generate questions to answer. Will predict + as is. Should be used with --trained-extractor + --tune-for= Predictions tuning. Options are high-prec + or high-recall [default: high-prec] + --extractor-config= Sets the extractor config + --version Version number + -h --help Show this screen +""" + +import os +import json +import logging +from docopt import docopt +from sys import exit + +import iepy +INSTANCE_PATH = iepy.setup(__file__) + +from iepy.extraction.active_learning_core import ActiveLearningCore, HIPREC, HIREC +from iepy.data.db import CandidateEvidenceManager +from iepy.data.models import Relation +from iepy.extraction.terminal import TerminalAdministration +from iepy.data import output + + +def print_all_relations(): + print("All available relations:") + for relation in Relation.objects.all(): + print(" {}".format(relation)) + + +def load_labeled_evidences(relation, evidences): + CEM = CandidateEvidenceManager # shorcut + return CEM.labels_for(relation, evidences, CEM.conflict_resolution_newest_wins) + + +def _get_tuning_mode(opts): + if opts['--tune-for'] == 'high-prec': + tuning_mode = HIPREC + elif opts['--tune-for'] == 'high-recall': + tuning_mode = HIREC + else: + print ('Invalid tuning mode') + print (__doc__) + exit(1) + return tuning_mode + + +def _get_relation(opts): + relation_name = opts[''] + try: + relation = Relation.objects.get(name=relation_name) + except Relation.DoesNotExist: + print("Relation {!r} non existent".format(relation_name)) + print_all_relations() + exit(1) + return relation + + +def _load_extractor(opts, relation, labeled_evidences): + extractor_path = opts.get('--trained-extractor') + try: + iextractor = ActiveLearningCore.load(extractor_path, + labeled_evidences=labeled_evidences) + except ValueError: + print("Error: unable to load extractor, invalid file") + exit(1) + + if iextractor.relation != relation: + print('The loaded extractor is not for the requested relation' + ' but for relation {} instead'.format(iextractor.relation)) + exit(1) + print('Extractor successfully loaded') + return iextractor + + +def _construct_extractor(opts, relation, labeled_evidences, tuning_mode): + config_filepath = opts.get("--extractor-config") + if not config_filepath: + config_filepath = os.path.join(INSTANCE_PATH, "extractor_config.json") + + if not os.path.exists(config_filepath): + print("Error: extractor config does not exists, please create the " + "file extractor_config.json or use the --extractor-config") + exit(1) + + with open(config_filepath) as filehandler: + try: + extractor_config = json.load(filehandler) + except Exception as error: + print("Error: unable to load extractor config: {}".format(error)) + exit(1) + + iextractor = ActiveLearningCore( + relation, labeled_evidences, extractor_config, tradeoff=tuning_mode + ) + return iextractor + + +def run_from_command_line(): + opts = docopt(__doc__, version=iepy.__version__) + + logging.basicConfig(level=logging.INFO, format='%(message)s') + logging.getLogger("featureforge").setLevel(logging.WARN) + + tuning_mode = _get_tuning_mode(opts) + relation = _get_relation(opts) + + candidates = CandidateEvidenceManager.candidates_for_relation(relation) + labeled_evidences = load_labeled_evidences(relation, candidates) + + if opts.get('--trained-extractor'): + iextractor = _load_extractor(opts, relation, labeled_evidences) + was_ever_trained = True + opts["--no-questions"] = True + else: + iextractor = _construct_extractor(opts, relation, labeled_evidences, tuning_mode) + iextractor.start() + was_ever_trained = False + + if not opts.get("--no-questions", False): + questions_loop(iextractor, relation, was_ever_trained) + + # Predict and store output + predictions = iextractor.predict(candidates) # asking predictions for EVERYTHING + if not predictions: + print("Nothing was predicted") + exit(1) + + if opts.get("--db-store"): + output.dump_predictions_to_database(relation, predictions) + + output_file = opts.get("") + if output_file: + output.dump_runner_output_to_csv(predictions, output_file) + + classifier_output = opts.get("--store-extractor") + if classifier_output: + iextractor.save(classifier_output) + + +def questions_loop(iextractor, relation, was_ever_trained): + STOP = u'STOP' + term = TerminalAdministration( + relation, + extra_options=[(STOP, u'Stop execution')] + ) + while iextractor.questions: + questions = list(iextractor.questions) # copying the list + term.update_candidate_evidences_to_label(questions) + result = term() + i = 0 + for c, label_value in load_labeled_evidences(relation, questions).items(): + if label_value is not None: + iextractor.add_answer(c, label_value) + i += 1 + print ('Added %s new human labels to the extractor core' % i) + iextractor.process() + was_ever_trained = True + if result == STOP: + break + + if not was_ever_trained: + # It's needed to run some process before asking for predictions + iextractor.process() + + +if __name__ == u'__main__': + run_from_command_line() diff --git a/iepy/hansard/bin/manage.py b/iepy/hansard/bin/manage.py new file mode 100644 index 0000000..55f2fde --- /dev/null +++ b/iepy/hansard/bin/manage.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python + +import sys + +from django.core.management import execute_from_command_line + +import iepy +iepy.setup(__file__) + + +if __name__ == "__main__": + execute_from_command_line(sys.argv) diff --git a/iepy/hansard/bin/preprocess.py b/iepy/hansard/bin/preprocess.py new file mode 100644 index 0000000..125711d --- /dev/null +++ b/iepy/hansard/bin/preprocess.py @@ -0,0 +1,88 @@ +""" +Corpus preprocessing script + +Usage: + preprocess.py [options] + preprocess.py --split-in= --run-part= + preprocess.py --increment-ner + preprocess.py -h | --help | --version + +Options: + -h --help Show this screen + --multiple-cores= Number of cores (use all to use every processor) + --increment-ner Re run NER and Gazetter for every document. If a document lacked any of the previous steps, will be preprocessed entirely. + --version Version number +""" +import logging + +from docopt import docopt + +import iepy +import multiprocessing +iepy.setup(__file__) +from iepy.data.db import DocumentManager +from iepy.preprocess.stanford_preprocess import StanfordPreprocess +from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps +from iepy.preprocess.segmenter import SyntacticSegmenterRunner + + +class ParallelDocManager(DocumentManager): + + def mines_of(self, qset, number_of_processors, my_id): + K = number_of_processors + N = my_id + clause = 'id %%%% %s = %s' % (K, N) + return qset.extra(where=[clause]) + +def start_preprocess(docs, increment_ner): + pipeline = PreProcessPipeline([ + StanfordPreprocess(increment_ner), + SyntacticSegmenterRunner(increment=True) + ], docs) + pipeline.process_everything() + +if __name__ == '__main__': + logger = logging.getLogger(u'preprocess') + logger.setLevel(logging.INFO) + logging.basicConfig(level=logging.INFO, format='%(message)s') + opts = docopt(__doc__, version=iepy.__version__) + increment_ner = opts['--increment-ner'] + + dm = ParallelDocManager() + all_docs = dm.get_documents_lacking_preprocess( + [PreProcessSteps.segmentation, PreProcessSteps.syntactic_parsing]) + + multiple_cores = opts.get('--multiple-cores') + split_in = opts.get("--split-in") + run_part = opts.get("--run-part") + + if multiple_cores: + if multiple_cores == "all": + multiple_cores = multiprocessing.cpu_count() + try: + multiple_cores = int(multiple_cores) + except ValueError: + logger.error("Invalid number of cores") + exit(1) + + for i in range(multiple_cores): + process = multiprocessing.Process( + target=start_preprocess, args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner) + ) + process.start() + elif split_in: + try: + split_in = int(split_in) + run_part = int(run_part) - 1 + except ValueError: + logger.error("Invalid split") + exit(1) + + if run_part < 0 or run_part > split_in: + logger.error("Parts must be between 1 and {}".format(split_in)) + exit(1) + + docs = dm.mines_of(all_docs, split_in, run_part) + start_preprocess(docs, increment_ner) + else: + start_preprocess(all_docs, increment_ner) diff --git a/iepy/hansard/bin/rules_verifier.py b/iepy/hansard/bin/rules_verifier.py new file mode 100644 index 0000000..98fce50 --- /dev/null +++ b/iepy/hansard/bin/rules_verifier.py @@ -0,0 +1,149 @@ +""" +IEPY rules verifier + + +Usage: + rules_verifier.py [options] + +Options: + --shuffle Chooses the sample randomly and not the first ones + --create-evidences Creates evidences that are missing [default: false] + -r --rule= Tests only this rule + -l --limit= Limits the amount of evidences uses + -h --help Show this screen +""" + +import sys +import logging +from docopt import docopt + +import refo +from django.core.exceptions import ObjectDoesNotExist +from colorama import init as colorama_init + +import iepy +iepy.setup(__file__) + +from iepy.data import models +from iepy.data.models import EvidenceCandidate +from iepy.data.db import CandidateEvidenceManager +from iepy.extraction.terminal import TerminalEvidenceFormatter +from iepy.extraction.rules import ( + load_rules, compile_rule, generate_tokens_to_match +) +from iepy.metrics import result_dict_from_predictions + + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +def run_from_command_line(): + opts = docopt(__doc__, version=iepy.__version__) + relation_name = opts.get("") + limit = opts.get("--limit") + rule_name = opts.get("--rule") + shuffle = opts.get("--shuffle") + create_evidences = opts.get("--create-evidences") + + if limit is None: + limit = -1 + + try: + limit = int(limit) + except ValueError: + logging.error("Invalid limit value, it must be a number") + sys.exit(1) + + try: + relation = models.Relation.objects.get(name=relation_name) + except ObjectDoesNotExist: + logging.error("Relation {!r} not found".format(relation_name)) + sys.exit(1) + + # Load rules + rules = get_rules(rule_name) + rule_regexes = [ + (rule.__name__, compile_rule(rule, relation), rule.answer) for rule in rules + ] + + # Load evidences + if EvidenceCandidate.objects.all().count() == 0: + create_evidences = True + evidences = CandidateEvidenceManager.candidates_for_relation( + relation, create_evidences, seg_limit=limit, shuffle_segs=shuffle + ) + conflict_solver = CandidateEvidenceManager.conflict_resolution_newest_wins + answers = CandidateEvidenceManager.labels_for( + relation, evidences, conflict_solver + ) + run_tests(rule_regexes, evidences, answers) + + +def run_tests(rule_regexes, evidences, answers): + predictions = [] + real_labels = [] + evidences_with_labels = [] + + colorama_init() + formatter = TerminalEvidenceFormatter() + + for name, regex, answer in rule_regexes: + title = "Matches for rule '{}' (value: {})".format(name, answer) + print("\n{}\n{}".format(title, "-" * len(title))) + + anything_matched = False + for evidence in evidences: + tokens_to_match = generate_tokens_to_match(evidence) + match = refo.match(regex, tokens_to_match) + + if match: + anything_matched = True + print(" * {}".format(formatter.colored_text(evidence))) + + if evidence in answers and answers[evidence] is not None: + evidences_with_labels.append(evidence) + real_labels.append(answers[evidence]) + + if match: + predictions.append(answer) + else: + predictions.append(False) + + if not anything_matched: + print(" nothing matched") + + print() + + if real_labels: + results = result_dict_from_predictions( + evidences_with_labels, real_labels, predictions + ) + results.pop("end_time") + keys = [ + "true_positives", "true_negatives", + "false_positives", "false_negatives", + "precision", "recall", + "accuracy", "f1", + ] + + title = "Metrics" + print("{}\n{}".format(title, "-" * len(title))) + for key in keys: + print("{:>15}: {:.2f}".format(key, results[key])) + + +def get_rules(rule_name): + # Load rules + rules = load_rules() + + if rule_name: + rules = [x for x in rules if x.__name__ == rule_name] + if not rules: + logging.error("rule '{}' does not exists".format(rule_name)) + sys.exit(1) + + return rules + + +if __name__ == "__main__": + run_from_command_line() diff --git a/iepy/hansard/extractor_config.json b/iepy/hansard/extractor_config.json new file mode 100644 index 0000000..2f406a7 --- /dev/null +++ b/iepy/hansard/extractor_config.json @@ -0,0 +1,20 @@ +{ + "classifier_args": {}, + "classifier": "svc", + "sparse_features": [ + "bag_of_words", + "bag_of_pos", + "bag_of_words_in_between", + "bag_of_pos_in_between" + ], + "dense_features": [ + "entity_order", + "entity_distance", + "other_entities_in_between", + "verbs_count_in_between", + "verbs_count", + "total_number_of_entities", + "symbols_in_between", + "number_of_tokens" + ] +} \ No newline at end of file diff --git a/iepy/hansard/rules.py b/iepy/hansard/rules.py new file mode 100644 index 0000000..8f5699f --- /dev/null +++ b/iepy/hansard/rules.py @@ -0,0 +1,2 @@ +# Write here your rules +# RELATION = 'your relation here' diff --git a/iepy/hansard/settings.py b/iepy/hansard/settings.py new file mode 100644 index 0000000..33ed886 --- /dev/null +++ b/iepy/hansard/settings.py @@ -0,0 +1,35 @@ +""" +For more information on this file, see +https://docs.djangoproject.com/en/1.7/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/1.7/ref/settings/ +""" + +from iepy.webui.webui.settings import * + +IEPY_VERSION = '0.9.5' +IEPY_LANG = 'en' +SECRET_KEY = 'ckje0v6lbcg!ru2nd4uqoxa_ijme84m71x$^7t&7#y73gal^+5' +DEBUG = True +TEMPLATE_DEBUG = True + +# Database +# https://docs.djangoproject.com/en/1.7/ref/settings/#databases +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.postgresql_psycopg2', + 'NAME': 'postgres', + 'USER': 'postgres', + 'HOST': 'dbiepy', + 'PORT': 5432, + } +} + +# For changing tokenization options, read here. +# http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html +# You can use as key any of the "known options" listed on that page, and as value, +# use True or False (python names) for booleans, or strings when option requires a text +# CORENLP_TKN_OPTS = { +# 'latexQuotes': False +# } diff --git a/run.js b/run.js deleted file mode 100644 index 9239263..0000000 --- a/run.js +++ /dev/null @@ -1,24 +0,0 @@ -'use strict' -const spawn = require('child_process').spawn; -const exec = require('child_process').exec; - - -const events = spawn('docker', ['events', '-f', 'container=hansard_api']) -const compose = spawn('docker-compose', ['up']) - -compose.stdout.on('data', (data) => { - console.log(data.toString()); -}) - -compose.stderr.on('data', (data) => { - console.log(data.toString()); -}) - -events.stdout.on('data', (data) => { - if (data.toString().indexOf('attach') !== -1){ - exec('docker exec hansard_api_1 python api/manage.py migrate', (error, stdout, stderr) =>{ - console.log(stdout) - console.log(stderr) - }) - } -}); From 00b8ce9fdf8e3c96a9fec1b6f29c3b8f1faf0f9b Mon Sep 17 00:00:00 2001 From: timini Date: Sun, 7 Aug 2016 10:41:19 +0100 Subject: [PATCH 05/12] run travis tests inside docker --- .travis.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index addc912..ea99964 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,9 @@ -language: python -python: - - "3.4" +sudo: required + +services: + - docker # cd into the api project -before_install: cd api -install: pip install -r requirements.txt -# change from /api to /api/api -before_script: cd api -script: python manage.py test +before_install: + - make api +script: + - make test-api From d82a2a2539d9f4ad4e1701f59bcea288d2f877d6 Mon Sep 17 00:00:00 2001 From: timini Date: Sun, 7 Aug 2016 11:32:59 +0100 Subject: [PATCH 06/12] update makefile with task to wait for database --- .travis.yml | 2 +- Makefile | 9 ++++++--- docker-compose-api.yml | 2 +- docker-compose-iepy.yml | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index ea99964..8bea3d2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ sudo: required - +language: python services: - docker # cd into the api project diff --git a/Makefile b/Makefile index 18ec9e7..7e8c91c 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ stop: docker-stop-api docker-stop-iepy docker-build-api: docker-compose -f docker-compose-api.yml build -docker-up-api: docker-stop-api docker-build-api +docker-up-api: docker-rm-api docker-build-api docker-compose -f docker-compose-api.yml up -d docker-stop-api: @@ -16,7 +16,10 @@ docker-stop-api: docker-rm-api: docker-stop-api docker-compose -f docker-compose-api.yml rm -f -migrate-api: docker-up-api +wait-for-postgres: docker-up-api + docker exec hansard_db_1 bash -c "while ! pg_isready; do echo \"$(date) - waiting for database to start\"; sleep 3; done" + +migrate-api: docker-up-api wait-for-postgres docker exec hansard_api_1 python api/manage.py migrate createsuperuser-api: migrate-api @@ -29,7 +32,7 @@ test-api: migrate-api docker exec -ti hansard_api_1 bash -c "cd api && python manage.py test" api: createsuperuser-api - echo "api running on localhost:3000" + echo "api running on localhost:8000" #-------------------- diff --git a/docker-compose-api.yml b/docker-compose-api.yml index 0d411e9..ad94a54 100644 --- a/docker-compose-api.yml +++ b/docker-compose-api.yml @@ -1,5 +1,5 @@ db: - image: postgres + image: postgres:9.6 api: build: ./api command: python api/manage.py runserver 0.0.0.0:8000 diff --git a/docker-compose-iepy.yml b/docker-compose-iepy.yml index 6c24ebc..e67d94a 100644 --- a/docker-compose-iepy.yml +++ b/docker-compose-iepy.yml @@ -1,5 +1,5 @@ dbiepy: - image: postgres + image: postgres:9.6 iepy: build: ./iepy ports: From 5b8cdfdec492c9110255f7360f79d0dd735b2514 Mon Sep 17 00:00:00 2001 From: timini Date: Sun, 7 Aug 2016 11:40:49 +0100 Subject: [PATCH 07/12] change make targets --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7e8c91c..c468171 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ createsuperuser-api: migrate-api init-data: migrate-api docker exec -ti hansard_api_1 python api/manage.py init_data -test-api: migrate-api +test-api: docker exec -ti hansard_api_1 bash -c "cd api && python manage.py test" api: createsuperuser-api From a757b9f8fae652aca689897cc9efee6d0be4fe6a Mon Sep 17 00:00:00 2001 From: timini Date: Sat, 13 Aug 2016 02:08:12 +0100 Subject: [PATCH 08/12] push docker image on build --- .travis.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.travis.yml b/.travis.yml index 8bea3d2..66dcd01 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,3 +7,6 @@ before_install: - make api script: - make test-api +after_success: + - docker login -e="$DOCKER_EMAIL" -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" + - docker tag hansard_api_1:latest gcr.io/hansard-1012/hansard_api:$(git rev-parse HEAD) From 8e38d791d2d6e1ce6da947409a1ea81a5e83ef1a Mon Sep 17 00:00:00 2001 From: timini Date: Sat, 13 Aug 2016 02:19:59 +0100 Subject: [PATCH 09/12] test travis config --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 66dcd01..09802bf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,4 +9,5 @@ script: - make test-api after_success: - docker login -e="$DOCKER_EMAIL" -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" - - docker tag hansard_api_1:latest gcr.io/hansard-1012/hansard_api:$(git rev-parse HEAD) + - echo "$DOCKER_USERNAME-$DOCKER_PASSWORD" + - docker tag hansard_api_1 gcr.io/hansard-1012/hansard_api:$(git rev-parse HEAD) From a851a4539265812247df1eca8ebac2256e55e292 Mon Sep 17 00:00:00 2001 From: timini Date: Sat, 13 Aug 2016 02:22:54 +0100 Subject: [PATCH 10/12] test travis config --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 09802bf..0c827bb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,6 @@ before_install: script: - make test-api after_success: - - docker login -e="$DOCKER_EMAIL" -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" + - docker login -e="$DOCKER_EMAIL" -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" https://gcr.io - echo "$DOCKER_USERNAME-$DOCKER_PASSWORD" - docker tag hansard_api_1 gcr.io/hansard-1012/hansard_api:$(git rev-parse HEAD) From 553c45245c4bdf12672617852f84dbe7989044fd Mon Sep 17 00:00:00 2001 From: timini Date: Sat, 13 Aug 2016 02:30:01 +0100 Subject: [PATCH 11/12] test travis config --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0c827bb..acbe74a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,5 +9,4 @@ script: - make test-api after_success: - docker login -e="$DOCKER_EMAIL" -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" https://gcr.io - - echo "$DOCKER_USERNAME-$DOCKER_PASSWORD" - - docker tag hansard_api_1 gcr.io/hansard-1012/hansard_api:$(git rev-parse HEAD) + - docker tag hansard_api gcr.io/hansard-1012/hansard_api:$(git rev-parse HEAD) From f81bd09fb863626956be7ff95fffe0d9b7961946 Mon Sep 17 00:00:00 2001 From: timini Date: Sat, 13 Aug 2016 02:32:28 +0100 Subject: [PATCH 12/12] test travis config --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index acbe74a..243a2d0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,3 +10,4 @@ script: after_success: - docker login -e="$DOCKER_EMAIL" -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" https://gcr.io - docker tag hansard_api gcr.io/hansard-1012/hansard_api:$(git rev-parse HEAD) + - docker push gcr.io/hansard-1012/hansard_api:$(git rev-parse HEAD)