Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,9 @@ VOLUME ["/job/data", "/job/src", "/job/work", "/job/output"]

# You should install any dependencies you need here.
# RUN pip install tqdm

RUN pip install nltk

RUN python -c "import nltk; nltk.download('gutenberg'); nltk.download('brown')"

CMD ["python", "model.py"]
6 changes: 6 additions & 0 deletions report.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Our Approach:
We decided that the most straightforward and effective method to solve the problem was with an n-gram character model with backoff, up to n = 5. N-gram models of orders 2 through 5 are built during training. Each model stores a mapping from a context window (the preceding n-1 characters) to a frequency count of all characters that followed it in the training data. At prediction time, the model attempts to match the longest possible context first (4 preceding characters for the 5-gram model). If that context was never seen during training, it backs off to a shorter context, trying 4-gram, then 3-gram, then bigram, and finally falling back to raw unigram character frequencies. This backoff strategy ensures the model always produces a prediction even for novel or unusual input sequences.

Our Data:
Training data was sourced from two NLTK corpora: the Gutenberg corpus, which contains classic English literary texts, and the Brown corpus, which contains a broad sample of American English across many genres. Together these provide several million characters of diverse English text. The nltk package is used to download and access the training corpora.
We initially intended to use publicly available transcripts of communications between astronauts and ground control from previous NASA missions. However, we realized this approach has two issues. First, the transcripts total only a few megabytes, which is less data than we would like for this type of task. Second, and more importantly, the project description indicates that our predictive model will be used on more natural language than we had assumed, whereas the NASA transcripts contain a massive number of repetitive phrases like “copy” and “maneuver”. In the end, we decided to use a standard, freely available English library, as it would cover the most test cases involving natural English speech.
200 changes: 174 additions & 26 deletions src/myprogram.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,210 @@
#!/usr/bin/env python
import os
import json
import string
import random
from collections import defaultdict
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter


class MyModel:
    """
    Character-level n-gram model with backoff for next-character prediction.

    Trains on text from NLTK corpora and builds n-gram tables for orders
    2..MAX_N plus a unigram table. At prediction time the longest matching
    context is consulted first, backing off to shorter contexts (and finally
    to unigram counts) until NUM_GUESSES distinct characters are collected.
    """

    MAX_N = 5  # highest n-gram order to build
    NUM_GUESSES = 3  # number of characters emitted per prediction

    # NLTK corpus identifiers requested for training.
    CORPORA = ('gutenberg', 'brown', 'udhr', 'cess_esp', 'floresta', 'mac_morpho', 'indian', 'machado', 'sinica_treebank', 'jeita')

    # Line terminators are never valid guesses: write_pred() emits one
    # prediction per line, so a predicted newline would corrupt the output.
    _UNPREDICTABLE = frozenset('\r\n')

    def __init__(self):
        # ngrams[n][context] = {char: count, ...}
        # context is a string of length n-1
        self.ngrams = {}
        # unigram fallback: {char: count}
        self.unigrams = defaultdict(int)

    # ------------------------------------------------------------------
    # Data loading
    # ------------------------------------------------------------------

    @classmethod
    def load_training_data(cls):
        """
        Return the training text gathered from the NLTK corpora in CORPORA.

        Returns:
            A one-element list holding all corpus text concatenated, or an
            empty list if NLTK cannot be imported or the download step fails.
            A corpus that cannot be read is skipped (text already collected
            from it is kept) and a notice is printed.
        """
        try:
            import ssl
            try:
                # NOTE(security): this disables TLS certificate verification
                # process-wide so that nltk.download works on hosts with broken
                # certificate stores. Kept for compatibility; review before
                # reusing this code outside a sandboxed training job.
                ssl._create_default_https_context = ssl._create_unverified_context
            except AttributeError:
                pass

            import nltk
            for corpus_id in cls.CORPORA:
                nltk.download(corpus_id, quiet=True)

            # Collect pieces and join once instead of repeated string +=.
            parts = []
            for corpus_id in cls.CORPORA:
                try:
                    corpus = getattr(nltk.corpus, corpus_id)
                    cls._collect_corpus_text(corpus, corpus_id, parts)
                except Exception as e:
                    # Best effort: one unreadable corpus must not abort training.
                    print(f'Skipping corpus {corpus_id}: {e}')

            text = ''.join(parts)
            print(f'Loaded {len(text):,} characters from NLTK corpora')
            return [text]
        except Exception as e:
            print(f'NLTK load failed ({e}), falling back to ascii printable chars as dummy corpus')
            return []

    @staticmethod
    def _collect_corpus_text(corpus, corpus_id, parts):
        """Append the raw text of `corpus` to `parts`, file by file if needed."""
        if not hasattr(corpus, 'raw'):
            return
        # 'udhr' is always read one file at a time; every other corpus is first
        # tried as a single raw() call and only then read per file.
        if corpus_id != 'udhr':
            try:
                parts.append(corpus.raw())
                return
            except Exception:
                pass
        for fileid in corpus.fileids():
            parts.append(corpus.raw(fileid))

    @classmethod
    def load_test_data(cls, fname):
        """
        Read one input context per line from the UTF-8 file `fname`.

        Only the line terminator is removed; other trailing whitespace is part
        of the context and is preserved. A final line without a trailing
        newline is returned intact.
        """
        data = []
        with open(fname, encoding='utf-8') as f:
            for line in f:
                # Slice only when a newline is really present, so the last
                # character of an unterminated final line is not lost.
                data.append(line[:-1] if line.endswith('\n') else line)
        return data

    @classmethod
    def write_pred(cls, preds, fname):
        """Write each prediction in `preds` on its own line of `fname` (UTF-8)."""
        with open(fname, 'wt', encoding='utf-8') as f:
            for p in preds:
                f.write('{}\n'.format(p))

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------

    def run_train(self, data, work_dir):
        """
        Build n-gram frequency tables from the training corpus.

        Args:
            data: list of strings (usually just one large string).
            work_dir: unused here; accepted to keep the training interface.
        """
        orders = range(2, self.MAX_N + 1)
        # Initialise tables for orders 2..MAX_N (order 1 = unigram handled separately)
        for n in orders:
            self.ngrams[n] = defaultdict(lambda: defaultdict(int))

        total_chars = 0
        for text in data:
            for i, ch in enumerate(text):
                self.unigrams[ch] += 1
                # For each n-gram order, record context -> next char
                for n in orders:
                    start = i - (n - 1)
                    if start < 0:
                        # Not enough history yet; higher orders need even more.
                        break
                    self.ngrams[n][text[start:i]][ch] += 1
            total_chars += len(text)

        print(f'Trained on {total_chars:,} characters')
        print(f'Unigram vocab size: {len(self.unigrams)}')
        for n in orders:
            print(f'  {n}-gram contexts: {len(self.ngrams[n]):,}')

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------

    def _add_ranked(self, guesses, counts):
        """
        Append characters from `counts` to `guesses`, most frequent first.

        Ties keep the insertion order of `counts`. Characters already guessed
        and line terminators are skipped. Returns True once `guesses` holds
        NUM_GUESSES characters.
        """
        for ch, _ in sorted(counts.items(), key=lambda item: -item[1]):
            if ch in self._UNPREDICTABLE or ch in guesses:
                continue
            guesses.append(ch)
            if len(guesses) >= self.NUM_GUESSES:
                return True
        return False

    def predict_next(self, context):
        """
        Predict the most likely next characters after `context`.

        Uses backoff from MAX_N down to unigram, then ASCII letters in
        alphabetical order as a deterministic last resort.

        Returns:
            A list of NUM_GUESSES distinct characters, best guess first.
            Newline and carriage return are never returned.
        """
        guesses = []

        # Try from longest context down to bigram
        for n in range(self.MAX_N, 1, -1):
            width = n - 1
            if len(context) < width:
                continue
            followers = self.ngrams.get(n, {}).get(context[-width:])
            if followers and self._add_ranked(guesses, followers):
                return guesses

        # Unigram fallback
        if self.unigrams and self._add_ranked(guesses, self.unigrams):
            return guesses

        # Last resort: ASCII letters in fixed order
        for ch in string.ascii_letters:
            if ch not in guesses:
                guesses.append(ch)
                if len(guesses) >= self.NUM_GUESSES:
                    return guesses

        return guesses

    def run_pred(self, data):
        """Return one string of NUM_GUESSES guessed characters per input context."""
        preds = []
        for inp in data:
            top_guesses = self.predict_next(inp)
            # Defensive padding; predict_next normally returns a full list.
            while len(top_guesses) < self.NUM_GUESSES:
                candidate = random.choice(string.ascii_letters)
                if candidate not in top_guesses:
                    top_guesses.append(candidate)
            preds.append(''.join(top_guesses[:self.NUM_GUESSES]))
        return preds

    # ------------------------------------------------------------------
    # Save / Load
    # ------------------------------------------------------------------

    def save(self, work_dir):
        """Serialise the count tables as JSON to `work_dir`/model.checkpoint."""
        checkpoint = {
            'unigrams': dict(self.unigrams),
            'ngrams': {
                # JSON object keys must be strings, so the order n is stringified.
                str(n): {ctx: dict(chars) for ctx, chars in table.items()}
                for n, table in self.ngrams.items()
            }
        }
        path = os.path.join(work_dir, 'model.checkpoint')
        with open(path, 'wt', encoding='utf-8') as f:
            json.dump(checkpoint, f, ensure_ascii=False)
        print(f'Model saved to {path}')

    @classmethod
    def load(cls, work_dir):
        """Rebuild a model from the JSON checkpoint written by save()."""
        path = os.path.join(work_dir, 'model.checkpoint')
        with open(path, encoding='utf-8') as f:
            checkpoint = json.load(f)

        model = cls()
        model.unigrams = defaultdict(int, checkpoint['unigrams'])
        model.ngrams = {}
        for n_str, table in checkpoint['ngrams'].items():
            restored = defaultdict(lambda: defaultdict(int))
            for ctx, chars in table.items():
                restored[ctx] = defaultdict(int, chars)
            # Orders were stringified for JSON; convert back to int keys.
            model.ngrams[int(n_str)] = restored

        print(f'Model loaded from {path}')
        return model


# ----------------------------------------------------------------------
# Entry point
# ----------------------------------------------------------------------

if __name__ == '__main__':
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
Expand All @@ -75,15 +220,17 @@ def load(cls, work_dir):
if not os.path.isdir(args.work_dir):
print('Making working directory {}'.format(args.work_dir))
os.makedirs(args.work_dir)
print('Instatiating model')
print('Instantiating model')
model = MyModel()
print('Loading training data')
train_data = MyModel.load_training_data()
print('Training')
model.run_train(train_data, args.work_dir)
print('Saving model')
model.save(args.work_dir)

elif args.mode == 'test':

print('Loading model')
model = MyModel.load(args.work_dir)
print('Loading test data from {}'.format(args.test_data))
Expand All @@ -93,5 +240,6 @@ def load(cls, work_dir):
print('Writing predictions to {}'.format(args.test_output))
assert len(pred) == len(test_data), 'Expected {} predictions but got {}'.format(len(test_data), len(pred))
model.write_pred(pred, args.test_output)

else:
raise NotImplementedError('Unknown mode {}'.format(args.mode))
raise NotImplementedError('Unknown mode {}'.format(args.mode))
3 changes: 3 additions & 0 deletions team.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Saumitra Joshi,sjoshi6
Armand Meyer,armand03
Aryan Kedarisetty,aryankk