diff --git a/Dockerfile b/Dockerfile index 816fd3063..c907c9d08 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,3 +5,9 @@ VOLUME ["/job/data", "/job/src", "/job/work", "/job/output"] # You should install any dependencies you need here. # RUN pip install tqdm + +RUN pip install nltk + +RUN python -c "import nltk; nltk.download('gutenberg'); nltk.download('brown')" + +CMD ["python", "model.py"] \ No newline at end of file diff --git a/report.txt b/report.txt new file mode 100644 index 000000000..796b3c34c --- /dev/null +++ b/report.txt @@ -0,0 +1,6 @@ +Our Approach: + We decided that the most straightforward and effective method to solve the problem was with an n-gram character model with backoff, up to n = 5. N-gram models of orders 2 through 5 are built during training. Each model stores a mapping from a context window (the preceding n-1 characters) to a frequency count of all characters that followed it in the training data. At prediction time, the model attempts to match the longest possible context first (4 preceding characters for the 5-gram model). If that context was never seen during training, it backs off to a shorter context, trying 4-gram, then 3-gram, then bigram, and finally falling back to raw unigram character frequencies. This backoff strategy ensures the model always produces a prediction even for novel or unusual input sequences. + +Our Data: + Training data was sourced from two NLTK corpora: the Gutenberg corpus, which contains classic English literary texts, and the Brown corpus, which contains a broad sample of American English across many genres. Together these provide several million characters of diverse English text. The nltk package is used to download and access the training corpora. + We initially intended to use publicly available transcripts from some previous NASA missions between astronauts and ground control. However, we realized this approach has a couple of issues. 
Firstly, the transcripts add up to only a few megabytes, which is less data than we would like for this type of task. Secondly and more importantly, based on the project description, our predictive model would be used on more natural language than we had initially assumed. Specifically, the NASA transcripts contain a massive number of repetitive phrases like “copy” and “maneuver”, which are not representative of natural English. In the end, we decided to just use standard, publicly available English corpora (via NLTK), as they would cover the widest range of test cases involving natural English speech. diff --git a/src/myprogram.py b/src/myprogram.py index 23488fd30..880f75c64 100644 --- a/src/myprogram.py +++ b/src/myprogram.py @@ -1,65 +1,210 @@ #!/usr/bin/env python import os +import json import string import random +from collections import defaultdict from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter class MyModel: """ - This is a starter model to get you started. Feel free to modify this file. + Character-level n-gram model with backoff for next-character prediction. + Trains on NLTK corpora (Gutenberg + Brown) and uses n-grams from order 5 + down to 1, backing off to shorter contexts when higher-order counts are + unavailable. 
""" + MAX_N = 5 # highest n-gram order to build + + def __init__(self): + # ngrams[n][context] = {char: count, ...} + # context is a string of length n-1 + self.ngrams = {} + # unigram fallback: {char: count} + self.unigrams = defaultdict(int) + + # ------------------------------------------------------------------ + # Data loading + # ------------------------------------------------------------------ + @classmethod def load_training_data(cls): - # your code here - # this particular model doesn't train - return [] + """Return a single long string of training text from NLTK corpora.""" + try: + import ssl + try: + ssl._create_default_https_context = ssl._create_unverified_context + except AttributeError: + pass + + import nltk + corpora = ('gutenberg', 'brown', 'udhr', 'cess_esp', 'floresta', 'mac_morpho', 'indian', 'machado', 'sinica_treebank', 'jeita') + for corpus_id in corpora: + nltk.download(corpus_id, quiet=True) + + text = "" + for corpus_id in corpora: + try: + corpus = getattr(nltk.corpus, corpus_id) + if hasattr(corpus, 'raw'): + if corpus_id == 'udhr': + for fileid in corpus.fileids(): + text += corpus.raw(fileid) + else: + try: + text += corpus.raw() + except: + try: + for fileid in corpus.fileids(): + text += corpus.raw(fileid) + except: + pass + except Exception as e: + pass + + print(f'Loaded {len(text):,} characters from NLTK corpora') + return [text] + except Exception as e: + print(f'NLTK load failed ({e}), falling back to ascii printable chars as dummy corpus') + return [] @classmethod def load_test_data(cls, fname): - # your code here data = [] - with open(fname) as f: + with open(fname, encoding='utf-8') as f: for line in f: - inp = line[:-1] # the last character is a newline + inp = line[:-1] # strip trailing newline data.append(inp) return data @classmethod def write_pred(cls, preds, fname): - with open(fname, 'wt') as f: + with open(fname, 'wt', encoding='utf-8') as f: for p in preds: f.write('{}\n'.format(p)) + # 
------------------------------------------------------------------ + # Training + # ------------------------------------------------------------------ + def run_train(self, data, work_dir): - # your code here - pass + """ + Build n-gram frequency tables from the training corpus. + data is a list of strings (usually just one large string). + """ + # Initialise tables for orders 2..MAX_N (order 1 = unigram handled separately) + for n in range(2, self.MAX_N + 1): + self.ngrams[n] = defaultdict(lambda: defaultdict(int)) + + total_chars = 0 + for text in data: + for i, ch in enumerate(text): + self.unigrams[ch] += 1 + total_chars += 1 + # For each n-gram order, record context → next char + for n in range(2, self.MAX_N + 1): + if i >= n - 1: + context = text[i - (n - 1): i] + self.ngrams[n][context][ch] += 1 + + print(f'Trained on {total_chars:,} characters') + print(f'Unigram vocab size: {len(self.unigrams)}') + for n in range(2, self.MAX_N + 1): + print(f' {n}-gram contexts: {len(self.ngrams[n]):,}') + + # ------------------------------------------------------------------ + # Prediction + # ------------------------------------------------------------------ + + def predict_next(self, context): + """ + Predict the 3 most likely next characters after `context`. + Uses backoff from MAX_N down to unigram. 
+ """ + guesses = [] + + # Try from longest context down to bigram + for n in range(self.MAX_N, 1, -1): + if len(context) >= n - 1: + ctx = context[-(n - 1):] + if ctx in self.ngrams.get(n, {}): + candidates = sorted(self.ngrams[n][ctx].items(), key=lambda x: -x[1]) + for ch, _ in candidates: + if ch not in guesses: + guesses.append(ch) + if len(guesses) >= 3: + return guesses + + # Unigram fallback + if self.unigrams: + candidates = sorted(self.unigrams.items(), key=lambda x: -x[1]) + for ch, _ in candidates: + if ch not in guesses: + guesses.append(ch) + if len(guesses) >= 3: + return guesses + + # Last resort: random printable ASCII + for ch in string.ascii_letters: + if ch not in guesses: + guesses.append(ch) + if len(guesses) >= 3: + return guesses + + return guesses def run_pred(self, data): - # your code here preds = [] - all_chars = string.ascii_letters for inp in data: - # this model just predicts a random character each time - top_guesses = [random.choice(all_chars) for _ in range(3)] - preds.append(''.join(top_guesses)) + top_guesses = self.predict_next(inp) + # Pad to exactly 3 guesses if needed + while len(top_guesses) < 3: + candidate = random.choice(string.ascii_letters) + if candidate not in top_guesses: + top_guesses.append(candidate) + preds.append(''.join(top_guesses[:3])) return preds + # ------------------------------------------------------------------ + # Save / Load + # ------------------------------------------------------------------ + def save(self, work_dir): - # your code here - # this particular model has nothing to save, but for demonstration purposes we will save a blank file - with open(os.path.join(work_dir, 'model.checkpoint'), 'wt') as f: - f.write('dummy save') + checkpoint = { + 'unigrams': dict(self.unigrams), + 'ngrams': { + str(n): {ctx: dict(chars) for ctx, chars in table.items()} + for n, table in self.ngrams.items() + } + } + path = os.path.join(work_dir, 'model.checkpoint') + with open(path, 'wt', encoding='utf-8') as f: 
+ json.dump(checkpoint, f, ensure_ascii=False) + print(f'Model saved to {path}') @classmethod def load(cls, work_dir): - # your code here - # this particular model has nothing to load, but for demonstration purposes we will load a blank file - with open(os.path.join(work_dir, 'model.checkpoint')) as f: - dummy_save = f.read() - return MyModel() + path = os.path.join(work_dir, 'model.checkpoint') + with open(path, encoding='utf-8') as f: + checkpoint = json.load(f) + + model = cls() + model.unigrams = defaultdict(int, checkpoint['unigrams']) + model.ngrams = {} + for n_str, table in checkpoint['ngrams'].items(): + n = int(n_str) + model.ngrams[n] = defaultdict(lambda: defaultdict(int)) + for ctx, chars in table.items(): + model.ngrams[n][ctx] = defaultdict(int, chars) + print(f'Model loaded from {path}') + return model + + +# ---------------------------------------------------------------------- +# Entry point +# ---------------------------------------------------------------------- if __name__ == '__main__': parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) @@ -75,7 +220,7 @@ def load(cls, work_dir): if not os.path.isdir(args.work_dir): print('Making working directory {}'.format(args.work_dir)) os.makedirs(args.work_dir) - print('Instatiating model') + print('Instantiating model') model = MyModel() print('Loading training data') train_data = MyModel.load_training_data() @@ -83,7 +228,9 @@ def load(cls, work_dir): model.run_train(train_data, args.work_dir) print('Saving model') model.save(args.work_dir) + elif args.mode == 'test': + print('Loading model') model = MyModel.load(args.work_dir) print('Loading test data from {}'.format(args.test_data)) @@ -93,5 +240,6 @@ def load(cls, work_dir): print('Writing predictions to {}'.format(args.test_output)) assert len(pred) == len(test_data), 'Expected {} predictions but got {}'.format(len(test_data), len(pred)) model.write_pred(pred, args.test_output) + else: - raise NotImplementedError('Unknown mode 
{}'.format(args.mode)) + raise NotImplementedError('Unknown mode {}'.format(args.mode)) \ No newline at end of file diff --git a/team.txt b/team.txt new file mode 100644 index 000000000..548c2faee --- /dev/null +++ b/team.txt @@ -0,0 +1,3 @@ +Saumitra Joshi,sjoshi6 +Armand Meyer,armand03 +Aryan Kedarisetty,aryankk