From e968869c47af52326f153bdfeed9872b401b3a92 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Wed, 17 Feb 2021 10:06:49 +0000
Subject: [PATCH] Add -c option to split-sentences.perl

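Some documents contain extremely long lines of generated text (most often links to search page results) that take forever to parse with the regular expressions in split-sentences.perl. With the new -c <limit> option, any line longer than <limit> characters is skipped entirely. The preprocess step now also appends the final look-ahead token only when the word list is non-empty.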
---
 moses/ems/support/split-sentences.perl | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/moses/ems/support/split-sentences.perl b/moses/ems/support/split-sentences.perl
index ae510ce..f6611bd 100755
--- a/moses/ems/support/split-sentences.perl
+++ b/moses/ems/support/split-sentences.perl
@@ -28,6 +28,7 @@
 my $NOP = 0;
 my $KEEP_LINES = 0;
 my $MODE = "singledocument";
+my $LIMIT = 0;
 
 while (@ARGV) {
 	$_ = shift;
@@ -40,10 +41,11 @@
 	/^-k$/ && ($KEEP_LINES = 1, next);
 	/^-b$/ && ($|++, next); # no output buffering
 	/^-d$/ && ($MODE = "base64documents", next);
+	/^-c$/ && ($LIMIT = shift, next);
 }
 
 if ($HELP) {
-	print "Usage ./split-sentences.perl (-l [en|de|...]) [-p prefix-file] [-q] [-b] < textfile > splitfile\n";
+	print "Usage ./split-sentences.perl (-l [en|de|...]) [-p prefix-file] [-q] [-b] [-c limit]< textfile > splitfile\n";
 	print "-q: quiet mode\n";
 	print "-b: no output buffering (for use in bidirectional pipes)\n";
 	print "-p: use a custom prefix file, overriding the installed one\n";
@@ -51,6 +53,7 @@
 	print "-n: do not emit <P> after paragraphs\n";
 	print "-k: keep existing line boundaries\n";
 	print "-d: work on multiple base64 encoded documents\n";
+	print "-c: skip lines longer than char count entirely\n";
 	exit;
 }
 if (!$QUIET) {
@@ -115,7 +118,10 @@ sub split_single_document {
 	# Loop over text, add lines together until we get a blank line or a <p>
 	while (<$fh>) {
 		chomp;
-		if ($KEEP_LINES) {
+		if ($LIMIT > 0 && length() > $LIMIT) {
+			# Skip extremely long lines entirely.
+			next;
+		} elsif ($KEEP_LINES) {
 			$out .= &split_block($_,"");
 		} elsif (/^<.+>$/ || /^\s*$/) {
 			# Time to process this block; we've hit a blank or <p>
@@ -268,7 +274,9 @@ sub preprocess {
 
 	# We stopped one token from the end to allow for easy look-ahead.
 	# Append it now.
-	$text = $text.$words[$i];
+	if (scalar(@words) > 0) {
+		$text = $text.$words[$i];
+	}
 
 	# Clean up spaces at head and tail of each line as well as any double-spacing
 	$text =~ s/ +/ /g;