From adba552cae230186549d82cebff2ecba012ab437 Mon Sep 17 00:00:00 2001 From: Saumitra404 Date: Tue, 27 Jan 2026 16:05:32 -0800 Subject: [PATCH 1/6] Add team member details to team.txt --- team.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 team.txt diff --git a/team.txt b/team.txt new file mode 100644 index 000000000..548c2faee --- /dev/null +++ b/team.txt @@ -0,0 +1,3 @@ +Saumitra Joshi,sjoshi6 +Armand Meyer,armand03 +Aryan Kedarisetty,aryankk From 772640fffe0237592e6232246390f8b375097960 Mon Sep 17 00:00:00 2001 From: ArmandMeyer Date: Thu, 12 Mar 2026 12:56:59 -0700 Subject: [PATCH 2/6] Added n-gram model --- src/myprogram.py | 160 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 136 insertions(+), 24 deletions(-) diff --git a/src/myprogram.py b/src/myprogram.py index 23488fd30..30baa699e 100644 --- a/src/myprogram.py +++ b/src/myprogram.py @@ -1,28 +1,56 @@ #!/usr/bin/env python import os +import json import string import random +from collections import defaultdict from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter class MyModel: """ - This is a starter model to get you started. Feel free to modify this file. + Character-level n-gram model with backoff for next-character prediction. + Trains on NLTK corpora (Gutenberg + Brown) and uses n-grams from order 5 + down to 1, backing off to shorter contexts when higher-order counts are + unavailable. 
""" + MAX_N = 5 # highest n-gram order to build + + def __init__(self): + # ngrams[n][context] = {char: count, ...} + # context is a string of length n-1 + self.ngrams = {} + # unigram fallback: {char: count} + self.unigrams = defaultdict(int) + + # ------------------------------------------------------------------ + # Data loading + # ------------------------------------------------------------------ + @classmethod def load_training_data(cls): - # your code here - # this particular model doesn't train - return [] + """Return a single long string of training text from NLTK corpora.""" + try: + import nltk + # Download corpora quietly if not already present + for corpus_id in ('gutenberg', 'brown'): + nltk.download(corpus_id, quiet=True) + + from nltk.corpus import gutenberg, brown + text = gutenberg.raw() + brown.raw() + print(f'Loaded {len(text):,} characters from NLTK corpora') + return [text] + except Exception as e: + print(f'NLTK load failed ({e}), falling back to ascii printable chars as dummy corpus') + return [] @classmethod def load_test_data(cls, fname): - # your code here data = [] with open(fname) as f: for line in f: - inp = line[:-1] # the last character is a newline + inp = line[:-1] # strip trailing newline data.append(inp) return data @@ -32,34 +60,115 @@ def write_pred(cls, preds, fname): for p in preds: f.write('{}\n'.format(p)) + # ------------------------------------------------------------------ + # Training + # ------------------------------------------------------------------ + def run_train(self, data, work_dir): - # your code here - pass + """ + Build n-gram frequency tables from the training corpus. + data is a list of strings (usually just one large string). 
+ """ + # Initialise tables for orders 2..MAX_N (order 1 = unigram handled separately) + for n in range(2, self.MAX_N + 1): + self.ngrams[n] = defaultdict(lambda: defaultdict(int)) + + total_chars = 0 + for text in data: + for i, ch in enumerate(text): + self.unigrams[ch] += 1 + total_chars += 1 + # For each n-gram order, record context → next char + for n in range(2, self.MAX_N + 1): + if i >= n - 1: + context = text[i - (n - 1): i] + self.ngrams[n][context][ch] += 1 + + print(f'Trained on {total_chars:,} characters') + print(f'Unigram vocab size: {len(self.unigrams)}') + for n in range(2, self.MAX_N + 1): + print(f' {n}-gram contexts: {len(self.ngrams[n]):,}') + + # ------------------------------------------------------------------ + # Prediction + # ------------------------------------------------------------------ + + def _top3(self, counter): + """Return the top-3 characters from a {char: count} dict.""" + if not counter: + return [] + return [ch for ch, _ in sorted(counter.items(), key=lambda x: -x[1])[:3]] + + def predict_next(self, context): + """ + Predict the 3 most likely next characters after `context`. + Uses backoff from MAX_N down to unigram. 
+ """ + # Try from longest context down to bigram + for n in range(self.MAX_N, 1, -1): + if len(context) >= n - 1: + ctx = context[-(n - 1):] + if ctx in self.ngrams.get(n, {}): + top = self._top3(self.ngrams[n][ctx]) + if top: + return top + + # Unigram fallback + if self.unigrams: + return self._top3(self.unigrams) + + # Last resort: random printable ASCII + return random.sample(string.ascii_letters, 3) def run_pred(self, data): - # your code here preds = [] - all_chars = string.ascii_letters for inp in data: - # this model just predicts a random character each time - top_guesses = [random.choice(all_chars) for _ in range(3)] - preds.append(''.join(top_guesses)) + top_guesses = self.predict_next(inp) + # Pad to exactly 3 guesses if needed + while len(top_guesses) < 3: + top_guesses.append(random.choice(string.ascii_letters)) + preds.append(''.join(top_guesses[:3])) return preds + # ------------------------------------------------------------------ + # Save / Load + # ------------------------------------------------------------------ + def save(self, work_dir): - # your code here - # this particular model has nothing to save, but for demonstration purposes we will save a blank file - with open(os.path.join(work_dir, 'model.checkpoint'), 'wt') as f: - f.write('dummy save') + checkpoint = { + 'unigrams': dict(self.unigrams), + 'ngrams': { + str(n): {ctx: dict(chars) for ctx, chars in table.items()} + for n, table in self.ngrams.items() + } + } + path = os.path.join(work_dir, 'model.checkpoint') + with open(path, 'wt', encoding='utf-8') as f: + json.dump(checkpoint, f, ensure_ascii=False) + print(f'Model saved to {path}') @classmethod def load(cls, work_dir): - # your code here - # this particular model has nothing to load, but for demonstration purposes we will load a blank file - with open(os.path.join(work_dir, 'model.checkpoint')) as f: - dummy_save = f.read() - return MyModel() + path = os.path.join(work_dir, 'model.checkpoint') + with open(path, 
encoding='utf-8') as f: + checkpoint = json.load(f) + + model = cls() + model.unigrams = defaultdict(int, checkpoint['unigrams']) + model.ngrams = {} + for n_str, table in checkpoint['ngrams'].items(): + n = int(n_str) + model.ngrams[n] = defaultdict(lambda: defaultdict(int)) + for ctx, chars in table.items(): + model.ngrams[n][ctx] = defaultdict(int, chars) + print(f'Model loaded from {path}') + return model + + +# ---------------------------------------------------------------------- +# Entry point +# ---------------------------------------------------------------------- if __name__ == '__main__': parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) @@ -75,7 +184,7 @@ def load(cls, work_dir): if not os.path.isdir(args.work_dir): print('Making working directory {}'.format(args.work_dir)) os.makedirs(args.work_dir) - print('Instatiating model') + print('Instantiating model') model = MyModel() print('Loading training data') train_data = MyModel.load_training_data() @@ -83,7 +192,9 @@ def load(cls, work_dir): model.run_train(train_data, args.work_dir) print('Saving model') model.save(args.work_dir) + elif args.mode == 'test': + print('Loading model') model = MyModel.load(args.work_dir) print('Loading test data from {}'.format(args.test_data)) @@ -93,5 +204,6 @@ def load(cls, work_dir): print('Writing predictions to {}'.format(args.test_output)) assert len(pred) == len(test_data), 'Expected {} predictions but got {}'.format(len(test_data), len(pred)) model.write_pred(pred, args.test_output) + else: - raise NotImplementedError('Unknown mode {}'.format(args.mode)) + raise NotImplementedError('Unknown mode {}'.format(args.mode)) \ No newline at end of file From 6fd5f38b2cf1e89414a2cf9b01aa65ffbdb8d993 Mon Sep 17 00:00:00 2001 From: ArmandMeyer Date: Fri, 13 Mar 2026 11:30:09 -0700 Subject: [PATCH 3/6] added text report --- report.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 report.txt diff --git a/report.txt b/report.txt new file 
mode 100644 index 000000000..796b3c34c --- /dev/null +++ b/report.txt @@ -0,0 +1,6 @@ +Our Approach: + We decided that the most straightforward and effective way to solve the problem was an n-gram character model with backoff, up to n = 5. N-gram models of orders 2 through 5 are built during training. Each model stores a mapping from a context window (the preceding n-1 characters) to a frequency count of all characters that followed it in the training data. At prediction time, the model attempts to match the longest possible context first (4 preceding characters for the 5-gram model). If that context was never seen during training, it backs off to a shorter context, trying the 4-gram, then the 3-gram, then the bigram, and finally falling back to raw unigram character frequencies. This backoff strategy ensures the model always produces a prediction, even for novel or unusual input sequences. + +Our Data: + Training data was sourced from two NLTK corpora: the Gutenberg corpus, which contains classic English literary texts, and the Brown corpus, which contains a broad sample of American English across many genres. Together these provide several million characters of diverse English text. The nltk package is used to download and access the training corpora. + We initially intended to use publicly available transcripts of communications between astronauts and ground control from previous NASA missions. However, we realized this approach has a couple of issues. First, the transcripts add up to only a few megabytes, which is less data than we would like for this type of task. Second, and more importantly, the project description indicates that our predictive model will be used on more natural language than we had assumed. Specifically, the NASA transcripts contain a massive number of repetitive phrases like “copy” and “maneuver”. In the end, we decided to just use standard, publicly available English corpora, as they would cover the most test cases involving natural English speech. 
From 94d2ce205a9552b4caf7d372273411755172f7ac Mon Sep 17 00:00:00 2001 From: Saumitra Joshi Date: Fri, 13 Mar 2026 18:51:34 -0700 Subject: [PATCH 4/6] Updated myprogram to 1. Include more nltk corpora (added multilingual corpora) 2. Added fallback when there's only one character for a prediction (instead of the 2nd and 3rd guess being random, they're now based on lower n-grams) 3. Added a bypass for SSL verification (so I could run it on my mac) --- src/myprogram.py | 68 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/src/myprogram.py b/src/myprogram.py index 30baa699e..779071ab8 100644 --- a/src/myprogram.py +++ b/src/myprogram.py @@ -32,13 +32,37 @@ def __init__(self): def load_training_data(cls): """Return a single long string of training text from NLTK corpora.""" try: + import ssl + try: + ssl._create_default_https_context = ssl._create_unverified_context + except AttributeError: + pass + import nltk - # Download corpora quietly if not already present - for corpus_id in ('gutenberg', 'brown'): + corpora = ('gutenberg', 'brown', 'udhr', 'cess_esp', 'floresta', 'mac_morpho', 'indian', 'machado') + for corpus_id in corpora: nltk.download(corpus_id, quiet=True) - from nltk.corpus import gutenberg, brown - text = gutenberg.raw() + brown.raw() + text = "" + for corpus_id in corpora: + try: + corpus = getattr(nltk.corpus, corpus_id) + if hasattr(corpus, 'raw'): + if corpus_id == 'udhr': + for fileid in corpus.fileids(): + text += corpus.raw(fileid) + else: + try: + text += corpus.raw() + except: + try: + for fileid in corpus.fileids(): + text += corpus.raw(fileid) + except: + pass + except Exception as e: + pass + print(f'Loaded {len(text):,} characters from NLTK corpora') return [text] except Exception as e: @@ -93,32 +117,42 @@ def run_train(self, data, work_dir): # Prediction # ------------------------------------------------------------------ - def _top3(self, counter): - """Return the top-3 
characters from a {char: count} dict.""" - if not counter: - return [] - return [ch for ch, _ in sorted(counter.items(), key=lambda x: -x[1])[:3]] - def predict_next(self, context): """ Predict the 3 most likely next characters after `context`. Uses backoff from MAX_N down to unigram. """ + guesses = [] + # Try from longest context down to bigram for n in range(self.MAX_N, 1, -1): if len(context) >= n - 1: ctx = context[-(n - 1):] if ctx in self.ngrams.get(n, {}): - top = self._top3(self.ngrams[n][ctx]) - if top: - return top + candidates = sorted(self.ngrams[n][ctx].items(), key=lambda x: -x[1]) + for ch, _ in candidates: + if ch not in guesses: + guesses.append(ch) + if len(guesses) >= 3: + return guesses # Unigram fallback if self.unigrams: - return self._top3(self.unigrams) + candidates = sorted(self.unigrams.items(), key=lambda x: -x[1]) + for ch, _ in candidates: + if ch not in guesses: + guesses.append(ch) + if len(guesses) >= 3: + return guesses # Last resort: random printable ASCII - return random.sample(string.ascii_letters, 3) + for ch in string.ascii_letters: + if ch not in guesses: + guesses.append(ch) + if len(guesses) >= 3: + return guesses + + return guesses def run_pred(self, data): preds = [] @@ -126,7 +160,9 @@ def run_pred(self, data): top_guesses = self.predict_next(inp) # Pad to exactly 3 guesses if needed while len(top_guesses) < 3: - top_guesses.append(random.choice(string.ascii_letters)) + candidate = random.choice(string.ascii_letters) + if candidate not in top_guesses: + top_guesses.append(candidate) preds.append(''.join(top_guesses[:3])) return preds From fd49c8bcdbfa6fc3c4e37af7cc952e099fd9ad52 Mon Sep 17 00:00:00 2001 From: ArmandMeyer Date: Fri, 13 Mar 2026 21:09:52 -0700 Subject: [PATCH 5/6] Docker works with dependencies. 
Added Chinese + Japanese corpora --- Dockerfile | 6 ++++++ src/myprogram.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 816fd3063..c907c9d08 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,3 +5,9 @@ VOLUME ["/job/data", "/job/src", "/job/work", "/job/output"] # You should install any dependencies you need here. # RUN pip install tqdm + +RUN pip install nltk + +RUN python -c "import nltk; nltk.download('gutenberg'); nltk.download('brown')" + +CMD ["python", "model.py"] \ No newline at end of file diff --git a/src/myprogram.py b/src/myprogram.py index 779071ab8..4c74520f0 100644 --- a/src/myprogram.py +++ b/src/myprogram.py @@ -39,7 +39,7 @@ def load_training_data(cls): pass import nltk - corpora = ('gutenberg', 'brown', 'udhr', 'cess_esp', 'floresta', 'mac_morpho', 'indian', 'machado') + corpora = ('gutenberg', 'brown', 'udhr', 'cess_esp', 'floresta', 'mac_morpho', 'indian', 'machado', 'sinica_treebank', 'jeita') for corpus_id in corpora: nltk.download(corpus_id, quiet=True) From 9782d7d2cf83e8e5251c709fa678d717fc2ff06d Mon Sep 17 00:00:00 2001 From: ArmandMeyer Date: Fri, 13 Mar 2026 21:57:12 -0700 Subject: [PATCH 6/6] addet utf-8 encoding --- src/myprogram.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/myprogram.py b/src/myprogram.py index 4c74520f0..880f75c64 100644 --- a/src/myprogram.py +++ b/src/myprogram.py @@ -72,7 +72,7 @@ def load_training_data(cls): @classmethod def load_test_data(cls, fname): data = [] - with open(fname) as f: + with open(fname, encoding='utf-8') as f: for line in f: inp = line[:-1] # strip trailing newline data.append(inp) @@ -80,7 +80,7 @@ def load_test_data(cls, fname): @classmethod def write_pred(cls, preds, fname): - with open(fname, 'wt') as f: + with open(fname, 'wt', encoding='utf-8') as f: for p in preds: f.write('{}\n'.format(p))