Skip to content

Commit 257ed47

Browse files
Peter Johnson
authored and committed
NLTK downloads added to dockerfile
1 parent 64d474d commit 257ed47

2 files changed

Lines changed: 11 additions & 6 deletions

File tree

Dockerfile

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,12 +20,18 @@ RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
2020
find /app/.venv -path "*/tests/*" -delete && \
2121
find /app/.venv -path "*/test/*" -delete && \
2222
find /app/.venv -name "*.md" -delete && \
23-
find /app/.venv -name "*.txt" -delete
23+
find /app/.venv -name "*.txt" -delete &&\
24+
# Download NLTK corpora into the venv so it's cached ---
25+
/app/.venv/bin/python -m nltk.downloader -d /app/.venv/nltk_data brown reuters gutenberg webtext
26+
27+
ENV NLTK_DATA=/app/.venv/nltk_data
2428

2529
FROM ghcr.io/lambda-feedback/evaluation-function-base/python:3.12
2630

2731
ENV VIRTUAL_ENV=/app/.venv \
2832
PATH="/app/.venv/bin:$PATH"
33+
ENV NLTK_DATA=/app/.venv/nltk_data
34+
2935

3036
# Copy the cleaned virtual environment
3137
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

evaluation_function/models/shannon_words_ngram.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,16 @@
11
"""
22
A simple n-gram (word) Shannon-style language model with add-one smoothing.
33
"""
4-
from lf_toolkit.evaluation import Result, Params
5-
import random, pickle, os
6-
import os
7-
import tempfile
4+
import sys, traceback, os
5+
import random, pickle, tempfile, re
86
from pathlib import Path
97
from io import StringIO
10-
import re
8+
from lf_toolkit.evaluation import Result, Params
119
from .utils import csv_to_lists
1210
import nltk
1311
from nltk.corpus import brown, reuters, gutenberg, webtext
1412

13+
# Local users run the following once (no need if using Docker):
1514
#nltk.download("brown"); nltk.download("reuters"); nltk.download("gutenberg"); nltk.download("webtext") # CHANGE (one-time)
1615

1716
START, END = "<s>", "</s>"

0 commit comments

Comments
 (0)