diff --git a/data/generated_assertions.jsonl b/data/generated_assertions.jsonl new file mode 100644 index 0000000..314cb75 --- /dev/null +++ b/data/generated_assertions.jsonl @@ -0,0 +1,3 @@ +{"ids":"2020.acl-main.427","assertions":["We propose a semantic parsing dataset focused on instruction-driven communication with an agent in the game Minecraft.","The dataset consists of 7K human utterances and their corresponding parses.","Given proper world state, the parses can be interpreted and executed in game.","We report the performance of baseline models.","We analyze the successes and failures of baseline models."]} +{"ids":"2020.acl-main.606","assertions":["This paper is concerned with semantic parsing for English as a second language (ESL).","The paper is motivated by the theoretical emphasis on the learning challenges that occur at the syntax-semantics interface during second language acquisition.","The task is formulated based on the divergence between literal and intended meanings.","The approach combines the complementary strengths of English Resource Grammar, a linguistically-precise hand-crafted deep grammar, and TLE, an existing manually annotated ESL UD-TreeBank, with a novel reranking model.","Experiments demonstrate that in comparison to human annotations, the method can obtain a very promising SemBanking quality.","By means of the newly created corpus, state-of-the-art semantic parsing as well as grammatical error correction models are evaluated.","The evaluation profiles the performance of neural NLP techniques for handling ESL data.","The evaluation suggests some research directions."]} +{"ids":"2020.acl-main.608","assertions":["One daunting problem for semantic parsing is the scarcity of annotation.","Aiming to reduce nontrivial human labor, we propose a two-stage semantic parsing framework.","The first stage utilizes an unsupervised paraphrase model to convert an unlabeled natural language utterance into the canonical utterance.","The downstream 
naive semantic parser accepts the intermediate output and returns the target logical form.","The entire training process is split into two phases: pre-training and cycle learning.","Three tailored self-supervised tasks are introduced throughout training to activate the unsupervised paraphrase model.","Experimental results on benchmarks Overnight and GeoGranno demonstrate that our framework is effective and compatible with supervised training."]} diff --git a/data/generated_relations.jsonl b/data/generated_relations.jsonl new file mode 100644 index 0000000..0302cfc --- /dev/null +++ b/data/generated_relations.jsonl @@ -0,0 +1,3 @@ +{"ids":"2020.acl-main.427","relations":[]} +{"ids":"2020.acl-main.606","relations":[{"relation":"Background","argument1":2,"argument2":1,"argument1_text":"The task is formulated based on the divergence between literal and intended meanings.","argument2_text":"The paper is motivated by the theoretical emphasis on the learning challenges that occur at the syntax-semantics interface during second language acquisition."},{"relation":"Evidence","argument1":3,"argument2":2,"argument1_text":"The approach combines the complementary strengths of English Resource Grammar, a linguistically-precise hand-crafted deep grammar, and TLE, an existing manually annotated ESL UD-TreeBank, with a novel reranking model.","argument2_text":"The task is formulated based on the divergence between literal and intended meanings."},{"relation":"Evidence","argument1":4,"argument2":3,"argument1_text":"Experiments demonstrate that in comparison to human annotations, the method can obtain a very promising SemBanking quality.","argument2_text":"The approach combines the complementary strengths of English Resource Grammar, a linguistically-precise hand-crafted deep grammar, and TLE, an existing manually annotated ESL UD-TreeBank, with a novel reranking model."},{"relation":"Background","argument1":5,"argument2":4,"argument1_text":"By means of the newly created corpus, 
state-of-the-art semantic parsing as well as grammatical error correction models are evaluated.","argument2_text":"Experiments demonstrate that in comparison to human annotations, the method can obtain a very promising SemBanking quality."},{"relation":"Evidence","argument1":6,"argument2":5,"argument1_text":"The evaluation profiles the performance of neural NLP techniques for handling ESL data.","argument2_text":"By means of the newly created corpus, state-of-the-art semantic parsing as well as grammatical error correction models are evaluated."}]} +{"ids":"2020.acl-main.608","relations":[{"relation":"Condition","argument1":3,"argument2":2,"argument1_text":"The downstream naive semantic parser accepts the intermediate output and returns the target logical form.","argument2_text":"The first stage utilizes an unsupervised paraphrase model to convert an unlabeled natural language utterance into the canonical utterance."},{"relation":"Background","argument1":4,"argument2":5,"argument1_text":"The entire training process is split into two phases: pre-training and cycle learning.","argument2_text":"Three tailored self-supervised tasks are introduced throughout training to activate the unsupervised paraphrase model."}]} diff --git a/data/selected_paper_abstracts.jsonl b/data/selected_paper_abstracts.jsonl new file mode 100644 index 0000000..0247559 --- /dev/null +++ b/data/selected_paper_abstracts.jsonl @@ -0,0 +1,50 @@ +{"ids":"2020.acl-main.427","abstracts":"We propose a semantic parsing dataset focused on instruction-driven communication with an agent in the game Minecraft. The dataset consists of 7K human utterances and their corresponding parses. Given proper world state, the parses can be interpreted and executed in game. We report the performance of baseline models, and analyze their successes and failures."} +{"ids":"2020.acl-main.606","abstracts":"This paper is concerned with semantic parsing for English as a second language (ESL). 
Motivated by the theoretical emphasis on the learning challenges that occur at the syntax-semantics interface during second language acquisition, we formulate the task based on the divergence between literal and intended meanings. We combine the complementary strengths of English Resource Grammar, a linguistically-precise hand-crafted deep grammar, and TLE, an existing manually annotated ESL UD-TreeBank with a novel reranking model. Experiments demonstrate that in comparison to human annotations, our method can obtain a very promising SemBanking quality. By means of the newly created corpus, we evaluate state-of-the-art semantic parsing as well as grammatical error correction models. The evaluation profiles the performance of neural NLP techniques for handling ESL data and suggests some research directions."} +{"ids":"2020.acl-main.608","abstracts":"One daunting problem for semantic parsing is the scarcity of annotation. Aiming to reduce nontrivial human labor, we propose a two-stage semantic parsing framework, where the first stage utilizes an unsupervised paraphrase model to convert an unlabeled natural language utterance into the canonical utterance. The downstream naive semantic parser accepts the intermediate output and returns the target logical form. Furthermore, the entire training process is split into two phases: pre-training and cycle learning. Three tailored self-supervised tasks are introduced throughout training to activate the unsupervised paraphrase model. Experimental results on benchmarks Overnight and GeoGranno demonstrate that our framework is effective and compatible with supervised training."} +{"ids":"2020.acl-main.742","abstracts":"We study the task of cross-database semantic parsing (XSP), where a system that maps natural language utterances to executable SQL queries is evaluated on databases unseen during training. Recently, several datasets, including Spider, were proposed to support development of XSP systems. 
We propose a challenging evaluation setup for cross-database semantic parsing, focusing on variation across database schemas and in-domain language use. We re-purpose eight semantic parsing datasets that have been well-studied in the setting where in-domain training data is available, and instead use them as additional evaluation data for XSP systems instead. We build a system that performs well on Spider, and find that it struggles to generalize to our re-purposed set. Our setup uncovers several generalization challenges for cross-database semantic parsing, demonstrating the need to use and develop diverse training and evaluation datasets."} +{"ids":"2020.acl-main.746","abstracts":"We introduce a transductive model for parsing into Universal Decompositional Semantics (UDS) representations, which jointly learns to map natural language utterances into UDS graph structures and annotate the graph with decompositional semantic attribute scores. We also introduce a strong pipeline model for parsing into the UDS graph structure, and show that our transductive parser performs comparably while additionally performing attribute prediction. By analyzing the attribute prediction errors, we find the model captures natural relationships between attribute groups."} +{"ids":"2020.acl-demos.29","abstracts":"The reader of a choose your own adventure novel and the user of a modern virtual assistant have a subtle similarity; both may, through the right lens, be viewed as engaging with a work of Interactive Fiction. This literary form emerged in the 1970s and has grown like a vine along the branch of modern technology, one guided by the advances of the other. In this work we weave together threads from the Interactive Fiction community and neural semantic parsing for dialog systems, defining the data model and necessary algorithms for a novel type of Interactive Fiction and open sourcing its accompanying authoring tool. 
Specifically, our work integrates retrieval based semantic parsing predicates into the branching story structures well known to the Interactive Fiction community, relaxing the relatively strict lexical options of preexisting systems."} +{"ids":"2020.alta-1.16","abstracts":"In neural semantic parsing, sentences are mapped to meaning representations using encoder-decoder frameworks. In this paper, we propose to apply the Transformer architecture, instead of recurrent neural networks, to this task. Experiments in two data sets from different domains and with different levels of difficulty show that our model achieved better results than strong baselines in certain settings and competitive results across all our experiments."} +{"ids":"2020.coling-main.226","abstracts":"Semantic parsing is the task of translating natural language utterances into machine-readable meaning representations. Currently, most semantic parsing methods are not able to utilize the contextual information (e.g. dialogue and comments history), which has a great potential to boost the semantic parsing systems. To address this issue, context dependent semantic parsing has recently drawn a lot of attention. In this survey, we investigate progress on the methods for the context dependent semantic parsing, together with the current datasets and tasks. We then point out open problems and challenges for future research in this area."} +{"ids":"2020.coling-main.289","abstracts":"In this paper, we present a multi-level alignment pretraining method in a unified architecture formulti-lingual semantic parsing. In this architecture, we use an adversarial training method toalign the space of different languages and use sentence level and word level parallel corpus assupervision information to align the semantic of different languages. Finally, we jointly train themulti-level alignment and semantic parsing tasks. 
We conduct experiments on a publicly avail-able multi-lingual semantic parsing dataset ATIS and a newly constructed dataset. Experimentalresults show that our model outperforms state-of-the-art methods on both datasets."} +{"ids":"2020.conll-1.40","abstracts":"Semantic parsing is one of the key components of natural language understanding systems. A successful parse transforms an input utterance to an action that is easily understood by the system. Many algorithms have been proposed to solve this problem, from conventional rule-based or statistical slot-filling systems to shift-reduce based neural parsers. For complex parsing tasks, the state-of-the-art method is based on an autoregressive sequence to sequence model that generates the parse directly. This model is slow at inference time, generating parses in O(n) decoding steps (n is the length of the target sequence). In addition, we demonstrate that this method performs poorly in zero-shot cross-lingual transfer learning settings. In this paper, we propose a non-autoregressive parser which is based on the insertion transformer to overcome these two issues. Our approach 1) speeds up decoding by 3x while outperforming the autoregressive model and 2) significantly improves cross-lingual transfer in the low-resource setting by 37% compared to autoregressive baseline. We test our approach on three wellknown monolingual datasets: ATIS, SNIPS and TOP. For cross-lingual semantic parsing, we use the MultiATIS++ and the multilingual TOP datasets."} +{"ids":"2020.conll-shared.5","abstracts":"We present PERIN, a novel permutation-invariant approach to sentence-to-graph semantic parsing. PERIN is a versatile, cross-framework and language independent architecture for universal modeling of semantic structures. Our system participated in the CoNLL 2020 shared task, Cross-Framework Meaning Representation Parsing (MRP 2020), where it was evaluated on five different frameworks (AMR, DRG, EDS, PTG and UCCA) across four languages. 
PERIN was one of the winners of the shared task. The source code and pretrained models are available at http:\/\/www.github.com\/ufal\/perin."} +{"ids":"2020.dmr-1.8","abstracts":"We propose an approach and a software framework for semantic parsing of natural language sentences to discourse representation structures with use of fuzzy meaning representations such as fuzzy sets and compatibility intervals. We explain the motivation for using fuzzy meaning representations in semantic parsing and describe the design of the proposed approach and the software framework, discussing various examples. We argue that the use of fuzzy meaning representations have potential to improve understanding and reasoning capabilities of systems working with natural language."} +{"ids":"2020.emnlp-main.118","abstracts":"Meaning representation is an important component of semantic parsing. Although researchers have designed a lot of meaning representations, recent work focuses on only a few of them. Thus, the impact of meaning representation on semantic parsing is less understood. Furthermore, existing work\u2019s performance is often not comprehensively evaluated due to the lack of readily-available execution engines. Upon identifying these gaps, we propose , a new unified benchmark on meaning representations, by integrating existing semantic parsing datasets, completing the missing logical forms, and implementing the missing execution engines. The resulting unified benchmark contains the complete enumeration of logical forms and execution engines over three datasets \u00d7 four meaning representations. A thorough experimental study on Unimer reveals that neural semantic parsing approaches exhibit notably different performance when they are trained to generate different meaning representations. Also, program alias and grammar rules heavily impact the performance of different meaning representations. 
Our benchmark, execution engines and implementation can be found on: https:\/\/github.com\/JasperGuo\/Unimer."} +{"ids":"2020.emnlp-main.323","abstracts":"AM dependency parsing is a linguistically principled method for neural semantic parsing with high accuracy across multiple graphbanks. It relies on a type system that models semantic valency but makes existing parsers slow. We describe an A* parser and a transition-based parser for AM dependency parsing which guarantee well-typedness and improve parsing speed by up to 3 orders of magnitude, while maintaining or improving accuracy."} +{"ids":"2020.emnlp-main.371","abstracts":"We combine character-level and contextual language model representations to improve performance on Discourse Representation Structure parsing. Character representations can easily be added in a sequence-to-sequence model in either one encoder or as a fully separate encoder, with improvements that are robust to different language models, languages and data sets. For English, these improvements are larger than adding individual sources of linguistic information or adding non-contextual embeddings. A new method of analysis based on semantic tags demonstrates that the character-level representations improve performance across a subset of selected semantic phenomena."} +{"ids":"2020.emnlp-main.408","abstracts":"The structured representation for semantic parsing in task-oriented assistant systems is geared towards simple understanding of one-turn queries. Due to the limitations of the representation, the session-based properties such as co-reference resolution and context carryover are processed downstream in a pipelined system. In this paper, we propose a semantic representation for such task-oriented conversational systems that can represent concepts such as co-reference and context carryover, enabling comprehensive understanding of queries in a session. 
We release a new session-based, compositional task-oriented parsing dataset of 20k sessions consisting of 60k utterances. Unlike Dialog State Tracking Challenges, the queries in the dataset have compositional forms. We propose a new family of Seq2Seq models for the session-based parsing above, which also set state-of-the-art in ATIS, SNIPS, TOP and DSTC2. Notably, we improve the best known results on DSTC2 by up to 5 points for slot-carryover."} +{"ids":"2020.emnlp-main.413","abstracts":"Task-oriented semantic parsing is a critical component of virtual assistants, which is responsible for understanding the user\u2019s intents (set reminder, play music, etc.). Recent advances in deep learning have enabled several approaches to successfully parse more complex queries (Gupta et al., 2018; Rongali et al.,2020), but these models require a large amount of annotated training data to parse queries on new domains (e.g. reminder, music). In this paper, we focus on adapting task-oriented semantic parsers to low-resource domains, and propose a novel method that outperforms a supervised neural model at a 10-fold data reduction. In particular, we identify two fundamental factors for low-resource domain adaptation: better representation learning and better training techniques. Our representation learning uses BART (Lewis et al., 2019) to initialize our model which outperforms encoder-only pre-trained representations used in previous work. Furthermore, we train with optimization-based meta-learning (Finn et al., 2017) to improve generalization to low-resource domains. This approach significantly outperforms all baseline methods in the experiments on a newly collected multi-domain task-oriented semantic parsing dataset (TOPv2), which we release to the public."} +{"ids":"2020.emnlp-main.558","abstracts":"We propose Grounded Adaptation for Zeroshot Executable Semantic Parsing (GAZP) to adapt an existing semantic parser to new environments (e.g. new database schemas). 
GAZP combines a forward semantic parser with a backward utterance generator to synthesize data (e.g. utterances and SQL queries) in the new environment, then selects cycle-consistent examples to adapt the parser. Unlike data-augmentation, which typically synthesizes unverified examples in the training environment, GAZP synthesizes examples in the new environment whose input-output consistency are verified through execution. On the Spider, Sparc, and CoSQL zero-shot semantic parsing tasks, GAZP improves logical form and execution accuracy of the baseline parser. Our analyses show that GAZP outperforms data-augmentation in the training environment, performance increases with the amount of GAZP-synthesized data, and cycle-consistency is central to successful adaptation."} +{"ids":"2020.emnlp-main.577","abstracts":"Knowledge graphs (KGs) can vary greatly from one domain to another. Therefore supervised approaches to both graph-to-text generation and text-to-graph knowledge extraction (semantic parsing) will always suffer from a shortage of domain-specific parallel graph-text data; at the same time, adapting a model trained on a different domain is often impossible due to little or no overlap in entities and relations. This situation calls for an approach that (1) does not need large amounts of annotated data and thus (2) does not need to rely on domain adaptation techniques to work well on different domains. To this end, we present the first approach to unsupervised text generation from KGs and show simultaneously how it can be used for unsupervised semantic parsing. We evaluate our approach on WebNLG v2.1 and a new benchmark leveraging scene graphs from Visual Genome. Our system outperforms strong baselines for both text<->graph conversion tasks without any manual adaptation from one dataset to the other. 
In additional experiments, we investigate the impact of using different unsupervised objectives."} +{"ids":"2020.emnlp-main.651","abstracts":"We consider a new perspective on dialog state tracking (DST), the task of estimating a user\u2019s goal through the course of a dialog. By formulating DST as a semantic parsing task over hierarchical representations, we can incorporate semantic compositionality, cross-domain knowledge sharing and co-reference. We present TreeDST, a dataset of 27k conversations annotated with tree-structured dialog states and system acts. We describe an encoder-decoder framework for DST with hierarchical representations, which leads to ~20% improvement over state-of-the-art DST approaches that operate on a flat meaning space of slot-value pairs."} +{"ids":"2020.findings-emnlp.167","abstracts":"Large-scale semantic parsing datasets annotated with logical forms have enabled major advances in supervised approaches. But can richer supervision help even more? To explore the utility of fine-grained, lexical-level supervision, we introduce SQUALL, a dataset that enriches 11,276 WIKITABLEQUESTIONS English-language questions with manually created SQL equivalents plus alignments between SQL and question fragments. Our annotation enables new training possibilities for encoderdecoder models, including approaches from machine translation previously precluded by the absence of alignments. We propose and test two methods: (1) supervised attention; (2) adopting an auxiliary objective of disambiguating references in the input queries to table columns. In 5-fold cross validation, these strategies improve over strong baselines by 4.4% execution accuracy. Oracle experiments suggest that annotated alignments can support further accuracy gains of up to 23.9%."} +{"ids":"2020.findings-emnlp.225","abstracts":"Generalization of models to out-of-distribution (OOD) data has captured tremendous attention recently. 
Specifically, compositional generalization, i.e., whether a model generalizes to new structures built of components observed during training, has sparked substantial interest. In this work, we investigate compositional generalization in semantic parsing, a natural test-bed for compositional generalization, as output programs are constructed from sub-components. We analyze a wide variety of models and propose multiple extensions to the attention module of the semantic parser, aiming to improve compositional generalization. We find that the following factors improve compositional generalization: (a) using contextual representations, such as ELMo and BERT, (b) informing the decoder what input tokens have previously been attended to, (c) training the decoder attention to agree with pre-computed token alignments, and (d) downsampling examples corresponding to frequent program templates. While we substantially reduce the gap between in-distribution and OOD generalization, performance on OOD compositions is still substantially lower."} +{"ids":"2020.findings-emnlp.255","abstracts":"The celebrated Seq2Seq technique and its numerous variants achieve excellent performance on many tasks such as neural machine translation, semantic parsing, and math word problem solving. However, these models either only consider input objects as sequences while ignoring the important structural information for encoding, or they simply treat output objects as sequence outputs instead of structural objects for decoding. In this paper, we present a novel Graph-to-Tree Neural Networks, namely Graph2Tree consisting of a graph encoder and a hierarchical tree decoder, that encodes an augmented graph-structured input and decodes a tree-structured output. In particular, we investigated our model for solving two problems, neural semantic parsing and math word problem. 
Our extensive experiments demonstrate that our Graph2Tree model outperforms or matches the performance of other state-of-the-art models on these tasks."} +{"ids":"2020.findings-emnlp.270","abstracts":"In specific domains, such as procedural scientific text, human labeled data for shallow semantic parsing is especially limited and expensive to create. Fortunately, such specific domains often use rather formulaic writing, such that the different ways of expressing relations in a small number of grammatically similar labeled sentences may provide high coverage of semantic structures in the corpus, through an appropriately rich similarity metric. In light of this opportunity, this paper explores an instance-based approach to the relation prediction sub-task within shallow semantic parsing, in which semantic labels from structurally similar sentences in the training set are copied to test sentences. Candidate similar sentences are retrieved using SciBERT embeddings. For labels where it is possible to copy from a similar sentence we employ an instance level copy network, when this is not possible, a globally shared parametric model is employed. Experiments show our approach outperforms both baseline and prior methods by 0.75 to 3 F1 absolute in the Wet Lab Protocol Corpus and 1 F1 absolute in the Materials Science Procedural Text Corpus."} +{"ids":"2020.findings-emnlp.320","abstracts":"Semantic parses are directed acyclic graphs (DAGs), but in practice most parsers treat them as strings or trees, mainly because models that predict graphs are far less understood. This simplification, however, comes at a cost: there is no guarantee that the output is a well-formed graph. A recent work by Fancellu et al. (2019) addressed this problem by proposing a graph-aware sequence model that utilizes a DAG grammar to guide graph generation. 
We significantly improve upon this work, by proposing a simpler architecture as well as more efficient training and inference algorithms that can always guarantee the well-formedness of the generated graphs. Importantly, unlike Fancellu et al., our model does not require language-specific features, and hence can harness the inherent ability of DAG-grammar parsing in multilingual settings. We perform monolingual as well as multilingual experiments on the Parallel Meaning Bank (Abzianidze et al., 2017). Our parser outperforms previous graph-aware models by a large margin, and closes the performance gap between string-based and DAG-grammar parsing."} +{"ids":"2020.findings-emnlp.364","abstracts":"Semantic parsing is an important NLP task. However, Vietnamese is a low-resource language in this research area. In this paper, we present the first public large-scale Text-to-SQL semantic parsing dataset for Vietnamese. We extend and evaluate two strong semantic parsing baselines EditSQL (Zhang et al., 2019) and IRNet (Guo et al., 2019) on our dataset. We compare the two baselines with key configurations and find that: automatic Vietnamese word segmentation improves the parsing results of both baselines; the normalized pointwise mutual information (NPMI) score (Bouma, 2009) is useful for schema linking; latent syntactic features extracted from a neural dependency parser for Vietnamese also improve the results; and the monolingual language model PhoBERT for Vietnamese (Nguyen and Nguyen, 2020) helps produce higher performances than the recent best multilingual language model XLM-R (Conneau et al., 2020)."} +{"ids":"2020.findings-emnlp.423","abstracts":"The current state-of-the-art task-oriented semantic parsing models use BERT or RoBERTa as pretrained encoders; these models have huge memory footprints. This poses a challenge to their deployment for voice assistants such as Amazon Alexa and Google Assistant on edge devices with limited memory budgets. 
We propose to learn compositional code embeddings to greatly reduce the sizes of BERT-base and RoBERTa-base. We also apply the technique to DistilBERT, ALBERT-base, and ALBERT-large, three already compressed BERT variants which attain similar state-of-the-art performances on semantic parsing with much smaller model sizes. We observe 95.15% 98.46% embedding compression rates and 20.47% 34.22% encoder compression rates, while preserving >97.5% semantic parsing performances. We provide the recipe for training and analyze the trade-off between code embedding sizes and downstream performances."} +{"ids":"2020.findings-emnlp.438","abstracts":"We present BRIDGE, a powerful sequential architecture for modeling dependencies between natural language questions and relational databases in cross-DB semantic parsing. BRIDGE represents the question and DB schema in a tagged sequence where a subset of the fields are augmented with cell values mentioned in the question. The hybrid sequence is encoded by BERT with minimal subsequent layers and the text-DB contextualization is realized via the fine-tuned deep attention in BERT. Combined with a pointer-generator decoder with schema-consistency driven search space pruning, BRIDGE attained state-of-the-art performance on the well-studied Spider benchmark (65.5% dev, 59.2% test), despite being much simpler than most recently proposed models for this task. Our analysis shows that BRIDGE effectively captures the desired cross-modal dependencies and has the potential to generalize to more text-DB related tasks. Our model implementation is available at https:\/\/github.com\/salesforce\/TabularSemanticParsing."} +{"ids":"2020.intexsempar-1.2","abstracts":"Collecting training data for semantic parsing is a time-consuming and expensive task. As a result, there is growing interest in industry to reduce the number of annotations required to train a semantic parser, both to cut down on costs and to limit customer data handled by annotators. 
In this paper, we propose uncertainty and traffic-aware active learning, a novel active learning method that uses model confidence and utterance frequencies from customer traffic to select utterances for annotation. We show that our method significantly outperforms baselines on an internal customer dataset and the Facebook Task Oriented Parsing (TOP) dataset. On our internal dataset, our method achieves the same accuracy as random sampling with 2,000 fewer annotations."} +{"ids":"2020.jeptalnrecital-demos.19","abstracts":"Nous pr\u00e9sentons des r\u00e9sum\u00e9s en fran\u00e7ais et en anglais de l\u2019article (Marzinotto et al., 2019) pr\u00e9sent\u00e9 \u00e0 la conf\u00e9rence North American Chapter of the Association for Computational Linguistics : Human Language Technologies en 2019."} +{"ids":"2020.lrec-1.515","abstracts":"Frame-semantic annotations exist for a tiny fraction of the world\u2019s languages, Wikidata, however, links knowledge base triples to texts in many languages, providing a common, distant supervision signal for semantic parsers. We present WikiBank, a multilingual resource of partial semantic structures that can be used to extend pre-existing resources rather than creating new man-made resources from scratch. We also integrate this form of supervision into an off-the-shelf frame-semantic parser and allow cross-lingual transfer. Using Google\u2019s Sling architecture, we show significant improvements on the English and Spanish CoNLL 2009 datasets, whether training on the full available datasets or small subsamples thereof."} +{"ids":"2020.lrec-1.714","abstracts":"Clinical trials often require that patients meet eligibility criteria (e.g., have specific conditions) to ensure the safety and the effectiveness of studies. 
However, retrieving eligible patients for a trial from the electronic health record (EHR) database remains a challenging task for clinicians since it requires not only medical knowledge about eligibility criteria, but also an adequate understanding of structured query language (SQL). In this paper, we introduce a new dataset that includes the first-of-its-kind eligibility-criteria corpus and the corresponding queries for criteria-to-sql (Criteria2SQL), a task translating the eligibility criteria to executable SQL queries. Compared to existing datasets, the queries in the dataset here are derived from the eligibility criteria of clinical trials and include Order-sensitive, Counting-based, and Boolean-type cases which are not seen before. In addition to the dataset, we propose a novel neural semantic parser as a strong baseline model. Extensive experiments show that the proposed parser outperforms existing state-of-the-art general-purpose text-to-sql models while highlighting the challenges presented by the new dataset. The uniqueness and the diversity of the dataset leave a lot of research opportunities for future improvement."} +{"ids":"2020.spnlp-1.3","abstracts":"Modern conversational AI systems support natural language understanding for a wide variety of capabilities. While a majority of these tasks can be accomplished using a simple and flat representation of intents and slots, more sophisticated capabilities require complex hierarchical representations supported by semantic parsing. State-of-the-art semantic parsers are trained using supervised learning with data labeled according to a hierarchical schema which might be costly to obtain or not readily available for a new domain. In this work, we explore the possibility of generating synthetic data for neural semantic parsing using a pretrained denoising sequence-to-sequence model (i.e., BART). 
Specifically, we first extract masked templates from the existing labeled utterances, and then fine-tune BART to generate synthetic utterances conditioning on the extracted templates. Finally, we use an auxiliary parser (AP) to filter the generated utterances. The AP guarantees the quality of the generated data. We show the potential of our approach when evaluating on the Facebook TOP dataset for navigation domain."} +{"ids":"2020.webnlg-1.13","abstracts":"We present a system for bilingual Data-ToText Generation and Semantic Parsing. We use a text-to-text generator to learn a single model that works for both languages on each of the tasks. The model is aided by machine translation during both pre-training and fine-tuning. We evaluate the system on WebNLG 2020 data 1 , which consists of RDF triples in English and natural language sentences in English and Russian for both the tasks. We achieve considerable gains over monolingual models, especially on unseen relations and Russian."} +{"ids":"2021.acl-long.74","abstracts":"Despite the success of sequence-to-sequence (seq2seq) models in semantic parsing, recent work has shown that they fail in compositional generalization, i.e., the ability to generalize to new structures built of components observed during training. In this work, we posit that a span-based parser should lead to better compositional generalization. we propose SpanBasedSP, a parser that predicts a span tree over an input utterance, explicitly encoding how partial programs compose over spans in the input. SpanBasedSP extends Pasupat et al. (2019) to be comparable to seq2seq models by (i) training from programs, without access to gold trees, treating trees as latent variables, (ii) parsing a class of non-projective trees through an extension to standard CKY. 
On GeoQuery, SCAN and CLOSURE datasets, SpanBasedSP performs similarly to strong seq2seq baselines on random splits, but dramatically improves performance compared to baselines on splits that require compositional generalization: from 61.0 \u2192 88.9 average accuracy."} +{"ids":"2021.acl-long.75","abstracts":"Sequence-to-sequence models excel at handling natural language variation, but have been shown to struggle with out-of-distribution compositional generalization. This has motivated new specialized architectures with stronger compositional biases, but most of these approaches have only been evaluated on synthetically-generated datasets, which are not representative of natural language variation. In this work we ask: can we develop a semantic parsing approach that handles both natural language variation and compositional generalization? To better assess this capability, we propose new train and test splits of non-synthetic datasets. We demonstrate that strong existing approaches do not perform well across a broad set of evaluations. We also propose NQG-T5, a hybrid model that combines a high-precision grammar-based approach with a pre-trained sequence-to-sequence model. It outperforms existing approaches across several compositional generalization challenges on non-synthetic data, while also being competitive with the state-of-the-art on standard evaluations. While still far from solving this problem, our study highlights the importance of diverse evaluations and the open challenge of handling both compositional generalization and natural language variation in semantic parsing."} +{"ids":"2021.acl-long.284","abstracts":"Conversational semantic parsers map user utterances to executable programs given dialogue histories composed of previous utterances, programs, and system responses. Existing parsers typically condition on rich representations of history that include the complete set of values and computations previously discussed. 
We propose a model that abstracts over values to focus prediction on type- and function-level context. This approach provides a compact encoding of dialogue histories and predicted programs, improving generalization and computational efficiency. Our model incorporates several other components, including an atomic span copy operation and structural enforcement of well-formedness constraints on predicted programs, that are particularly advantageous in the low-data regime. Trained on the SMCalFlow and TreeDST datasets, our model outperforms prior work by 7.3% and 10.6% respectively in terms of absolute accuracy. Trained on only a thousand examples from each dataset, it outperforms strong baselines by 12.4% and 6.4%. These results indicate that simple representations are key to effective generalization in conversational semantic parsing."} +{"ids":"2021.acl-long.397","abstracts":"Semantic parsing is challenging due to the structure gap and the semantic gap between utterances and logical forms. In this paper, we propose an unsupervised semantic parsing method - Synchronous Semantic Decoding (SSD), which can simultaneously resolve the semantic gap and the structure gap by jointly leveraging paraphrasing and grammar-constrained decoding. Specifically, we reformulate semantic parsing as a constrained paraphrasing problem: given an utterance, our model synchronously generates its canonical utterancel and meaning representation. During synchronously decoding: the utterance paraphrasing is constrained by the structure of the logical form, therefore the canonical utterance can be paraphrased controlledly; the semantic decoding is guided by the semantics of the canonical utterance, therefore its logical form can be generated unsupervisedly. 
Experimental results show that SSD is a promising approach and can achieve state-of-the-art unsupervised semantic parsing performance on multiple datasets."} +{"ids":"2021.acl-short.22","abstracts":"The predominant challenge in weakly supervised semantic parsing is that of spurious programs that evaluate to correct answers for the wrong reasons. Prior work uses elaborate search strategies to mitigate the prevalence of spurious programs; however, they typically consider only one input at a time. In this work we explore the use of consistency between the output programs for related inputs to reduce the impact of spurious programs. We bias the program search (and thus the model\u2019s training signal) towards programs that map the same phrase in related inputs to the same sub-parts in their respective programs. Additionally, we study the importance of designing logical formalisms that facilitate this kind of consistency-based training. We find that a more consistent formalism leads to improved model performance even without consistency-based training. When combined together, these two insights lead to a 10% absolute improvement over the best prior result on the Natural Language Visual Reasoning dataset."} +{"ids":"2021.adaptnlp-1.25","abstracts":"Many military communication domains involve rapidly conveying situation awareness with few words. Converting natural language utterances to logical forms in these domains is challenging, as these utterances are brief and contain multiple intents. In this paper, we present a first effort toward building a weakly-supervised semantic parser to transform brief, multi-intent natural utterances into logical forms. Our findings suggest a new \u201cprojection and reduction\u201d method that iteratively performs projection from natural to canonical utterances followed by reduction of natural utterances is the most effective. 
We conduct extensive experiments on two military and a general-domain dataset and provide a new baseline for future research toward accurate parsing of multi-intent utterances."} +{"ids":"2021.cl-1.3","abstracts":"In this work, we present a phenomenon-oriented comparative analysis of the two dominant approaches in English Resource Semantic (ERS) parsing: classic, knowledge-intensive and neural, data-intensive models. To reflect state-of-the-art neural NLP technologies, a factorization-based parser is introduced that can produce Elementary Dependency Structures much more accurately than previous data-driven parsers. We conduct a suite of tests for different linguistic phenomena to analyze the grammatical competence of different parsers, where we show that, despite comparable performance overall, knowledge- and data-intensive models produce different types of errors, in a way that can be explained by their theoretical properties. This analysis is beneficial to in-depth evaluation of several representative parsing techniques and leads to new directions for parser development."} +{"ids":"2021.eacl-main.66","abstracts":"Departing from both sequential pipelines and monotask systems, we propose Multiple Tasks Integration (MTI), a multitask paradigm orthogonal to weight sharing. The essence of MTI is to process the input iteratively but concurrently at multiple levels of analysis, where each decision is based on all of the structures that are already inferred and free from usual ordering constraints. We illustrate MTI with a system that performs part-of-speech tagging, syntactic dependency parsing and semantic dependency parsing. We observe that both the use of reinforcement learning and the release from sequential constraints are beneficial to the quality of the syntactic and semantic parses. 
We also observe that our model adopts an easy-first strategy that consists, on average, of predicting shorter dependencies before longer ones, but that syntax is not always tackled before semantics."} +{"ids":"2021.eacl-main.87","abstracts":"Being able to parse code-switched (CS) utterances, such as Spanish+English or Hindi+English, is essential to democratize task-oriented semantic parsing systems for certain locales. In this work, we focus on Spanglish (Spanish+English) and release a dataset, CSTOP, containing 5800 CS utterances alongside their semantic parses. We examine the CS generalizability of various Cross-lingual (XL) models and exhibit the advantage of pre-trained XL language models when data for only one language is present. As such, we focus on improving the pre-trained models for the case when only English corpus alongside either zero or a few CS training instances are available. We propose two data augmentation methods for the zero-shot and the few-shot settings: fine-tune using translate-and-align and augment using a generation model followed by match-and-filter. Combining the few-shot setting with the above improvements decreases the initial 30-point accuracy gap between the zero-shot and the full-data settings by two thirds."} +{"ids":"2021.eacl-main.109","abstracts":"In this work, we investigate the problems of semantic parsing in a few-shot learning setting. In this setting, we are provided with k utterance-logical form pairs per new predicate. The state-of-the-art neural semantic parsers achieve less than 25% accuracy on benchmark datasets when k = 1. To tackle this problem, we proposed to i) apply a designated meta-learning method to train the model; ii) regularize attention scores with alignment statistics; iii) apply a smoothing technique in pretraining. 
As a result, our method consistently outperforms all the baselines in both one and two-shot settings."} +{"ids":"2021.eacl-main.150","abstracts":"Speech disfluencies are prevalent in spontaneous speech. The rising popularity of voice assistants presents a growing need to handle naturally occurring disfluencies. Semantic parsing is a key component for understanding user utterances in voice assistants, yet most semantic parsing research to date focuses on written text. In this paper, we investigate semantic parsing of disfluent speech with the ATIS dataset. We find that a state-of-the-art semantic parser does not seamlessly handle disfluencies. We experiment with adding real and synthetic disfluencies at training time and find that adding synthetic disfluencies not only improves model performance by up to 39% but can also outperform adding real disfluencies in the ATIS dataset."} +{"ids":"2021.eacl-main.257","abstracts":"Scaling semantic parsing models for task-oriented dialog systems to new languages is often expensive and time-consuming due to the lack of available datasets. Available datasets suffer from several shortcomings: a) they contain few languages b) they contain small amounts of labeled examples per language c) they are based on the simple intent and slot detection paradigm for non-compositional queries. In this paper, we present a new multilingual dataset, called MTOP, comprising of 100k annotated utterances in 6 languages across 11 domains. We use this dataset and other publicly available datasets to conduct a comprehensive benchmarking study on using various state-of-the-art multilingual pre-trained models for task-oriented semantic parsing. We achieve an average improvement of +6.3 points on Slot F1 for the two existing multilingual datasets, over best results reported in their experiments. 
Furthermore, we demonstrate strong zero-shot performance using pre-trained models combined with automatic translation and alignment, and a proposed distant supervision method to reduce the noise in slot label projection."} +{"ids":"2021.emnlp-main.314","abstracts":"Frame semantic parsing is a semantic analysis task based on FrameNet which has received great attention recently. The task usually involves three subtasks sequentially: (1) target identification, (2) frame classification and (3) semantic role labeling. The three subtasks are closely related while previous studies model them individually, which ignores their intern connections and meanwhile induces error propagation problem. In this work, we propose an end-to-end neural model to tackle the task jointly. Concretely, we exploit a graph-based method, regarding frame semantic parsing as a graph construction problem. All predicates and roles are treated as graph nodes, and their relations are taken as graph edges. Experiment results on two benchmark datasets of frame semantic parsing show that our method is highly competitive, resulting in better performance than pipeline models."} +{"ids":"2021.emnlp-main.472","abstracts":"The availability of corpora has led to significant advances in training semantic parsers in English. Unfortunately, for languages other than English, annotated data is limited and so is the performance of the developed parsers. Recently, pretrained multilingual models have been proven useful for zero-shot cross-lingual transfer in many NLP tasks. What else does it require to apply a parser trained in English to other languages for zero-shot cross-lingual semantic parsing? Will simple language-independent features help? To this end, we experiment with six Discourse Representation Structure (DRS) semantic parsers in English, and generalize them to Italian, German and Dutch, where there are only a small number of manually annotated parses available. 
Extensive experiments show that despite its simplicity, adding Universal Dependency (UD) relations and Universal POS tags (UPOS) as model-agnostic features achieves surprisingly strong improvement on all parsers."} +{"ids":"2021.emnlp-main.607","abstracts":"In practical applications of semantic parsing, we often want to rapidly change the behavior of the parser, such as enabling it to handle queries in a new domain, or changing its predictions on certain targeted queries. While we can introduce new training examples exhibiting the target behavior, a mechanism for enacting such behavior changes without expensive model re-training would be preferable. To this end, we propose ControllAble Semantic Parser via Exemplar Retrieval (CASPER). Given an input query, the parser retrieves related exemplars from a retrieval index, augments them to the query, and then applies a generative seq2seq model to produce an output parse. The exemplars act as a control mechanism over the generic generative model: by manipulating the retrieval index or how the augmented query is constructed, we can manipulate the behavior of the parser. On the MTOP dataset, in addition to achieving state-of-the-art on the standard setup, we show that CASPER can parse queries in a new domain, adapt the prediction toward the specified patterns, or adapt to new semantic schemas without having to further re-train the model."} +{"ids":"2021.findings-emnlp.54","abstracts":"While neural networks are ubiquitous in state-of-the-art semantic parsers, it has been shown that most standard models suffer from dramatic performance losses when faced with compositionally out-of-distribution (OOD) data. Recently several methods have been proposed to improve compositional generalization in semantic parsing. In this work we instead focus on the problem of detecting compositionally OOD examples with neural semantic parsers, which, to the best of our knowledge, has not been investigated before. 
We investigate several strong yet simple methods for OOD detection based on predictive uncertainty. The experimental results demonstrate that these techniques perform well on the standard SCAN and CFQ datasets. Moreover, we show that OOD detection can be further improved by using a heterogeneous ensemble."} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5799289 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "Clarus" +description = "Clarus" +authors = [{name = "JetBrains"}] +version = "0.0.1" +readme = "README.md" +dynamic = ["dependencies"] +requires-python = ">=3.10,<3.11.0 || >3.11.0,<3.13" + +[tool.poetry.group.dev.dependencies] +acl-anthology = "^0.5.3" + +[tool.poetry.dependencies] +numpy = "2.0" +pandas = "^2.3.2" +grazie-api-gateway-client = "^0.3.3" +unsloth = "2025.8.1" +textdistance = "^4.6.3" +openai = "^1.107.3" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + diff --git a/src/get_assertions_and_relations.py b/src/get_assertions_and_relations.py new file mode 100644 index 0000000..6ea9017 --- /dev/null +++ b/src/get_assertions_and_relations.py @@ -0,0 +1,218 @@ +import os +import json +import argparse +import textdistance + +import pandas as pd + +from unsloth import FastLanguageModel +from transformers import TextStreamer + +from typing import List + +# os.environ["GRAZIE_JWT_TOKEN"] = "" + +from grazie.api.client.endpoints import GrazieApiGatewayUrls +from grazie.api.client.gateway import GrazieApiGatewayClient, GrazieAgent, AuthType +from grazie.api.client.profiles import Profile +from grazie.api.client.v8.chat.prompt import ChatPrompt +from grazie.api.client.v8.llm_parameters import LLMParameters +from grazie.api.client.v8.parameters import Parameters + +from utils.relation_prompts import relation2demo + +ASSERTION_PROMPT_FOR_GRAZIE_API_MODELS = "Your task is to segment the text into assertions, each standing as a separate claim." 
+RELATION_PROMPT_FOR_GRAZIE_API_MODELS = "Your task is to annotate claims with the relations from the Rhetorical Structure Theory." + +model2profile = {"openai-o1": Profile.OPENAI_O_1, "openai-gpt-4o": Profile.OPENAI_GPT_4_O, "openai-chat-gpt": Profile.OPENAI_CHAT_GPT} + +def generate_with_unsloth(prompt_text: str, model, tokenizer, hyperparams): + messages = [ + {"role": "user", "content": prompt_text} + ] + + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, # Must add for generation + enable_thinking=False, # Disable thinking + ) + + generated = model.generate( + **tokenizer(text, return_tensors="pt").to("cuda"), + **hyperparams, + streamer=TextStreamer(tokenizer, skip_prompt=True), + ) + + generated_text = tokenizer.batch_decode(generated)[0] + if "<|im_start|>assistant" in generated_text: + generated_text = generated_text.split("<|im_start|>assistant")[1].strip() + generated_text = generated_text.split("<|im_end|>")[0].strip() + if "" in generated_text: + generated_text = generated_text.split("")[1] + print(generated_text, type(generated_text)) + + return generated_text.strip() + + +def generate_with_openai(prompt_text: str, prompt_for_grazie_api_models: str, model, profile): + if profile.name == "openai-o1": + parameters = {} + else: + parameters = {LLMParameters.Temperature: Parameters.FloatValue(0.0)} + generated_text = model.v8.chat( + ChatPrompt() + .add_system(prompt_for_grazie_api_models) # TODO: double-check this! + .add_user(prompt_text), + profile=profile, + parameters=parameters + ).content + + return generated_text.strip() + +def extract_relations(relations, assertions): + output_relations = [] + for relation in relations: + arg1 = None + arg2 = None + for assertion_idx, assertion in enumerate(assertions): + assertion = assertion.strip() + if len(assertion)==0: # TODO: double check if this happens! 
+ continue + if arg1 and arg2: + output_relations.append({"relation": relation["relation"], "argument1": arg1, "argument2": arg2, "argument1_text": assertions[arg1], "argument2_text": assertions[arg2]}) + break + if assertion == relation["argument1"]: #if textdistance.overlap(assertion, relation["argument1"])>0.99: + arg1 = assertion_idx + if assertion == relation["argument2"]: #textdistance.overlap(assertion, relation["argument2"])>0.99: + arg2 = assertion_idx + + return output_relations + + +def generate_assertions_and_relations(task: str, input_data_path: str, output_data_path: str, model_name_or_path: str, debug: bool): + output_annotations = [] + ids = [] + + # Initializing the model and tokenizer (if used) + if "unsloth" in model_name_or_path.lower(): + profile = None + model, tokenizer = FastLanguageModel.from_pretrained( + model_name=model_name_or_path, + max_seq_length=4000, # 4000, default 2048 + load_in_4bit=True, # 4bit uses much less memory + load_in_8bit=False, # more accurate, uses 2x memory + full_finetuning=False, # inference-only mode + # token = "hf_...", # in case of gated models + ) + hyperparams = {"max_new_tokens": 512, "temperature": 0.7, "top_p": 0.8, + "top_k": 20} # non-thinking mode + + else: + profile = model2profile[model_name_or_path] + model = GrazieApiGatewayClient( + grazie_agent=GrazieAgent(name="grazie-ml-dev", version="dev"), + url=GrazieApiGatewayUrls.PRODUCTION, # PRODUCTION + auth_type=AuthType.USER, # USER + ) + tokenizer = None + hyperparams = dict() + + if task == "assertions": + prompt_for_grazie_api_models = ASSERTION_PROMPT_FOR_GRAZIE_API_MODELS + elif task == "relations": + prompt_for_grazie_api_models = ASSERTION_PROMPT_FOR_GRAZIE_API_MODELS + else: + raise ValueError("Invalid task") + + # Reading the data + df = pd.read_json(input_data_path, lines=True) + if debug: + df = df[:3] + for record in df.iterrows(): + record = record[1] + if task == "assertions": + raw_inputs = record["abstracts"] + elif task == 
"relations": + raw_inputs = "\n".join(record["assertions"]) + else: + raise ValueError("Invalid task") + + idx = record["ids"] + + if task == "assertions": + prompt_text = f"Given the following text your task is to output all claims (assertions) that appear in this text. Each claim must be on a new line: {raw_inputs}\n" + elif task == "relations": + relation_definitions = "" + for rel in relation2demo: + relation_definitions += "Relation: " + rel + " " + relation2demo[rel] + "\n" + prompt_text = f"Given the following claims your task is to output all possible relations between the claims. Relations can be: Evidence, Cause, Contrast, Condition, Background. Consider the following definitions: {relation_definitions}. Output JSON like in the example with one relation per line, you should identify all the relations present in the text: {{'relation': 'Relation', 'argument1':Argument1, 'argument2':Argument2}}\n {raw_inputs}" + else: + raise ValueError("Invalid task") + + if "unsloth" in model_name_or_path.lower(): + generated_text = generate_with_unsloth(prompt_text, model, tokenizer, hyperparams) + else: + generated_text = generate_with_openai(prompt_text, prompt_for_grazie_api_models, model, profile) + + ids.append(idx) + + if task == "assertions": + assertions = generated_text.split("\n") + processed_assertions = [] + for assertion in assertions: + if assertion[0].isdigit() or assertion.startswith("-"): + if " " in assertion: + assertion = assertion[assertion.index(" ")+1:] + processed_assertions.append(assertion.strip()) + output_annotations.append(processed_assertions) + + elif task == "relations": + try: + # OpenAI sometimes generates extra characters or misses brackets which results in failed parsing + generated_text = generated_text.replace("```json", "") + generated_text = generated_text.replace("```", "") + if "[" in generated_text: + generated_text = generated_text[generated_text.index("["):] + if "]" in generated_text: + generated_text = 
generated_text[:generated_text.rindex("]") + 1] + generated_text = generated_text.replace("'", "\"") + generated_text = generated_text.replace("}\n{", "},\n{") + if not generated_text.startswith("["): + generated_text = "[ " + generated_text + if not generated_text.endswith("]"): + generated_text = generated_text + "]" + + generated_text_as_json = json.loads(generated_text) + extracted_relations = extract_relations(generated_text_as_json, record["assertions"]) + output_annotations.append(extracted_relations) + except Exception as e: + print(e) + print(f"Could not parse to JSON: {generated_text}") + + if task == "assertions": + assertions_with_ids = [] + for assertions, idx in zip(output_annotations, ids): + assertions_with_ids.append({"ids": idx, "assertions": assertions}) + output_df = pd.DataFrame(assertions_with_ids) + output_df.to_json(output_data_path, orient="records", lines=True) # orient="index", indent=2) + elif task == "relations": + relations_with_ids = [] + for relation, idx in zip(output_annotations, ids): + relations_with_ids.append({"ids": idx, "relations": relation}) + output_df = pd.DataFrame(relations_with_ids) + output_df.to_json(output_data_path, orient="records", lines=True) # orient="index", indent=2) + + print(f"Finished! 
The outputs are stored in {output_data_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--task", type=str, choices=["assertions", "relations"], default="assertions") + parser.add_argument("--model_name_or_path", type=str, default="openai-gpt-4o") # "unsloth/Qwen3-14B-unsloth-bnb-4bit" + parser.add_argument("--input_data_path", type=str, default="data/selected_paper_abstracts.jsonl") + parser.add_argument("--output_data_path", type=str, default="data/generated_assertions.jsonl") + parser.add_argument("--debug", action="store_true") + args = parser.parse_args() + + generate_assertions_and_relations(args.task, args.input_data_path, args.output_data_path, args.model_name_or_path, args.debug) diff --git a/src/get_assertions_and_relations_openai.py b/src/get_assertions_and_relations_openai.py new file mode 100644 index 0000000..9613819 --- /dev/null +++ b/src/get_assertions_and_relations_openai.py @@ -0,0 +1,137 @@ +import os +import json +import argparse + +import textdistance +import openai + +import pandas as pd + + +from utils.relation_prompts import relation2demo + + +def generate_with_openai(prompt_text: str, model_name: str, client): + messages = [{"role": "user", "content": prompt_text}] + response = client.chat.completions.create(messages=messages, model=model_name, temperature=0) + if response.choices: + # Selecting the first choice + return response.choices[0].message.content.strip() + else: + return "No response generated." + + +def extract_relations(relations, assertions): + output_relations = [] + for relation in relations: + arg1 = None + arg2 = None + for assertion_idx, assertion in enumerate(assertions): + assertion = assertion.strip() + if len(assertion)==0: # TODO: double check if this happens! 
+ continue + if arg1 and arg2: + output_relations.append({"relation": relation["relation"], "argument1": arg1, "argument2": arg2, "argument1_text": assertions[arg1], "argument2_text": assertions[arg2]}) + break + if assertion == relation["argument1"]: #if textdistance.overlap(assertion, relation["argument1"])>0.99: + arg1 = assertion_idx + if assertion == relation["argument2"]: #textdistance.overlap(assertion, relation["argument2"])>0.99: + arg2 = assertion_idx + + return output_relations + + +def generate_assertions_and_relations(task: str, input_data_path: str, output_data_path: str, model_name_or_path: str, debug: bool): + output_annotations = [] + ids = [] + + client = openai.OpenAI() + + # Reading the data + df = pd.read_json(input_data_path, lines=True) + if debug: + df = df[:3] + for record in df.iterrows(): + record = record[1] + if task == "assertions": + raw_inputs = record["abstracts"] + elif task == "relations": + raw_inputs = "\n".join(record["assertions"]) + else: + raise ValueError("Invalid task") + + idx = record["ids"] + + if task == "assertions": + prompt_text = f"Given the following text your task is to output all claims (assertions) that appear in this text. Each claim must be on a new line: {raw_inputs}\n" + elif task == "relations": + relation_definitions = "" + for rel in relation2demo: + relation_definitions += "Relation: " + rel + " " + relation2demo[rel] + "\n" + prompt_text = f"Given the following claims your task is to output all possible relations between the claims. Relations can be: Evidence, Cause, Contrast, Condition, Background. Consider the following definitions: {relation_definitions}. 
Output JSON like in the example with one relation per line, you should identify all the relations present in the text: {{'relation': 'Relation', 'argument1':Argument1, 'argument2':Argument2}}\n {raw_inputs}" + else: + raise ValueError("Invalid task") + + generated_text = generate_with_openai(prompt_text, model_name_or_path, client) + + ids.append(idx) + + if task == "assertions": + assertions = generated_text.split("\n") + processed_assertions = [] + for assertion in assertions: + if assertion[0].isdigit() or assertion.startswith("-"): + if " " in assertion: + assertion = assertion[assertion.index(" ")+1:] + processed_assertions.append(assertion.strip()) + output_annotations.append(processed_assertions) + + elif task == "relations": + try: + # OpenAI sometimes generates extra characters or misses brackets which results in failed parsing + generated_text = generated_text.replace("```json", "") + generated_text = generated_text.replace("```", "") + if "[" in generated_text: + generated_text = generated_text[generated_text.index("["):] + if "]" in generated_text: + generated_text = generated_text[:generated_text.rindex("]") + 1] + generated_text = generated_text.replace("'", "\"") + generated_text = generated_text.replace("}\n{", "},\n{") + if not generated_text.startswith("["): + generated_text = "[ " + generated_text + if not generated_text.endswith("]"): + generated_text = generated_text + "]" + + generated_text_as_json = json.loads(generated_text) + extracted_relations = extract_relations(generated_text_as_json, record["assertions"]) + output_annotations.append(extracted_relations) + except Exception as e: + print(e) + print(f"Could not parse to JSON: {generated_text}") + + if task == "assertions": + assertions_with_ids = [] + for assertions, idx in zip(output_annotations, ids): + assertions_with_ids.append({"ids": idx, "assertions": assertions}) + output_df = pd.DataFrame(assertions_with_ids) + output_df.to_json(output_data_path, orient="records", lines=True) # 
def prepare_abstracts_from_acl(threshold: int, keyword_string: str, output_path: str):
    """Write up to `threshold` ACL Anthology abstracts whose title contains
    `keyword_string` to `output_path` as JSON lines with fields "ids" and
    "abstracts"."""
    # Instantiate the Anthology from the official repository
    anthology = Anthology.from_repo()

    rows = []
    for paper in anthology.papers():
        # Keyword match on the lowercased title; papers without an abstract are skipped.
        if keyword_string not in str(paper.title).lower():
            continue
        if paper.abstract is None:
            continue
        rows.append({"ids": paper.full_id, "abstracts": str(paper.abstract)})
        if len(rows) == threshold:
            break

    df = pd.DataFrame(rows, columns=["ids", "abstracts"])
    df.to_json(output_path, orient="records", lines=True)
# Demonstrations used when building the relation-extraction prompt: each entry
# maps a relation label to its definition plus a worked nucleus/satellite example.
relation2demo = {
    "Evidence": """Definition: The satellite provides evidence supporting the nucleus.

Example:

Nucleus: She must be home.

Satellite: The lights are on and her car is in the driveway.""",
    "Cause": """Definition: The satellite gives the reason for the nucleus.

Example:

Nucleus: The streets are wet.

Satellite: It rained last night.""",
    "Contrast": """Definition: Two nuclei are compared to highlight differences.

Example:

Nucleus 1: John loves classical music.

Nucleus 2: Mary prefers jazz.""",
    "Condition": """Definition: The nucleus holds only if the satellite condition is true.

Example:

Nucleus: You can go out to play.

Satellite: If you finish your homework first.""",
    "Background": """Definition: The satellite provides background information to help interpret the nucleus.

Example:

Nucleus: She finally passed the exam.

Satellite: She had failed it three times before.""",
}