From e968869c47af52326f153bdfeed9872b401b3a92 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde
Date: Wed, 17 Feb 2021 10:06:49 +0000
Subject: [PATCH] Add -c option to split-sentences.perl

Some documents contain extremely long lines of generated text (most
often links to search page results) that take forever to parse with the
regular expressions in split-sentences.perl. Using the -c option these
lines can be completely ignored.
---
 moses/ems/support/split-sentences.perl | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/moses/ems/support/split-sentences.perl b/moses/ems/support/split-sentences.perl
index ae510ce..f6611bd 100755
--- a/moses/ems/support/split-sentences.perl
+++ b/moses/ems/support/split-sentences.perl
@@ -28,6 +28,7 @@
 my $NOP = 0;
 my $KEEP_LINES = 0;
 my $MODE = "singledocument";
+my $LIMIT = 0;
 
 while (@ARGV) {
 	$_ = shift;
@@ -40,10 +41,11 @@
 	/^-k$/ && ($KEEP_LINES = 1, next);
 	/^-b$/ && ($|++, next); # no output buffering
 	/^-d$/ && ($MODE = "base64documents", next);
+	/^-c$/ && ($LIMIT = shift, next);
 }
 
 if ($HELP) {
-	print "Usage ./split-sentences.perl (-l [en|de|...]) [-p prefix-file] [-q] [-b] < textfile > splitfile\n";
+	print "Usage ./split-sentences.perl (-l [en|de|...]) [-p prefix-file] [-q] [-b] [-c limit]< textfile > splitfile\n";
 	print "-q: quiet mode\n";
 	print "-b: no output buffering (for use in bidirectional pipes)\n";
 	print "-p: use a custom prefix file, overriding the installed one\n";
@@ -51,6 +53,7 @@
 	print "-n: do not emit <P> after paragraphs\n";
 	print "-k: keep existing line boundaries\n";
 	print "-d: work on multiple base64 encoded documents\n";
+	print "-c: skip lines longer than char count entirely\n";
 	exit;
 }
 if (!$QUIET) {
@@ -115,7 +118,10 @@ sub split_single_document {
 	# Loop over text, add lines together until we get a blank line or a <p>
 	while (<$fh>) {
 		chomp;
-		if ($KEEP_LINES) {
+		if ($LIMIT > 0 && length() > $LIMIT) {
+			# Skip extremely long lines entirely.
+			next;
+		} elsif ($KEEP_LINES) {
 			$out .= &split_block($_,"");
 		} elsif (/^<.+>$/ || /^\s*$/) {
 			# Time to process this block; we've hit a blank or <p>
@@ -268,7 +274,9 @@ sub preprocess {
 
 	# We stopped one token from the end to allow for easy look-ahead.
 	# Append it now.
-	$text = $text.$words[$i];
+	if (scalar(@words) > 0) {
+		$text = $text.$words[$i];
+	}
 
 	# Clean up spaces at head and tail of each line as well as any double-spacing
 	$text =~ s/ +/ /g;