Skip to content

Commit beb5949

Browse files
authored
Merge pull request #50 from alpha-canada-ca/fix/profanity-filter
fix/profanity-filter
2 parents 4ac3c18 + 93f0e9e commit beb5949

1 file changed

Lines changed: 41 additions & 16 deletions

File tree

src/main/java/ca/gc/tbs/service/BadWords.java

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
import ca.gc.tbs.repository.BadWordEntryRepository;
55
import java.util.*;
66
import java.util.concurrent.ConcurrentHashMap;
7+
import java.util.regex.Matcher;
8+
import java.util.regex.Pattern;
9+
import java.util.stream.Collectors;
710
import javax.annotation.PostConstruct;
811
import org.slf4j.Logger;
912
import org.slf4j.LoggerFactory;
@@ -30,6 +33,8 @@ public class BadWords {
3033
// Combined set of all words to filter (profanity + threats)
3134
private final Set<String> allFilterWords = Collections.newSetFromMap(new ConcurrentHashMap<>());
3235

36+
private Pattern filterPattern;
37+
3338
@Autowired
3439
public BadWords(BadWordEntryRepository badWordEntryRepository) {
3540
this.badWordEntryRepository = badWordEntryRepository;
@@ -53,6 +58,7 @@ public void loadConfigs() {
5358
String word = entry.getWord().trim().toLowerCase();
5459
profanityWords.add(word);
5560
allFilterWords.add(word);
61+
compileFilterPattern();
5662
});
5763

5864
// Load threat words
@@ -138,33 +144,51 @@ public Set<String> getThreatWords() {
138144
return Collections.unmodifiableSet(threatWords);
139145
}
140146

147+
148+
private void compileFilterPattern() {
149+
if (allFilterWords.isEmpty()) {
150+
filterPattern = null;
151+
return;
152+
}
153+
String patternString = allFilterWords.stream()
154+
.filter(word -> word != null && !word.trim().isEmpty())
155+
.map(Pattern::quote)
156+
.map(word -> "\\b" + word + "\\b") // exact whole word only
157+
.collect(Collectors.joining("|"));
158+
filterPattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
159+
}
160+
141161
/**
142162
* Censors profanity and threats in the given text by replacing them with asterisks.
143163
* Words in the allowed words list are never censored.
144-
*
164+
*
145165
* @param text The text to censor
146166
* @return The censored text
147167
*/
148168
public String censor(String text) {
149169
if (text == null || text.isEmpty()) {
150170
return text;
151171
}
152-
153-
StringBuilder result = new StringBuilder();
154-
for (String word : text.split("\\s+")) {
155-
String wordToCheck =
156-
word.toLowerCase()
157-
.replaceAll("[^a-zà-ÿ]", ""); // Including accented characters for French
158-
159-
// Skip censoring if the word is in the allowed words list
160-
boolean shouldCensor = allFilterWords.contains(wordToCheck) &&
161-
!allowedWords.contains(wordToCheck);
162-
163-
result
164-
.append(shouldCensor ? createMask(word) : word)
165-
.append(' ');
172+
if (filterPattern == null) {
173+
// No filter words loaded
174+
return text;
175+
}
176+
177+
Matcher matcher = filterPattern.matcher(text);
178+
StringBuffer result = new StringBuffer();
179+
180+
while (matcher.find()) {
181+
String match = matcher.group();
182+
String normalized = match.toLowerCase().replaceAll("[^a-zà-ÿ]", "");
183+
184+
if (allowedWords.contains(normalized)) {
185+
matcher.appendReplacement(result, Matcher.quoteReplacement(match));
186+
} else {
187+
matcher.appendReplacement(result, Matcher.quoteReplacement(createMask(match)));
188+
}
166189
}
167-
return result.toString().trim();
190+
matcher.appendTail(result);
191+
return result.toString();
168192
}
169193

170194
/**
@@ -190,6 +214,7 @@ public void reload() {
190214
allowedWords.clear();
191215
errorKeywords.clear();
192216
allFilterWords.clear();
217+
compileFilterPattern();
193218

194219
// Reload from database
195220
loadConfigs();

0 commit comments

Comments
 (0)