Merge pull request #11655 from languagetool-org/custom-rules-jlanguagetool

Extend JLanguageTool to make customising the set of used rules easier
This commit is contained in:
Fabian Richter 2025-11-26 11:35:10 +01:00 committed by GitHub
commit 1d0e61a3cc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 106 additions and 52 deletions

View file

@ -298,7 +298,8 @@ public class JLanguageTool {
GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging) {
this(language, altLanguages, motherTongue, cache, globalConfig, userConfig, true, false);
}
/**
* Create a JLanguageTool and setup the built-in rules for the
* given language and false friend rules for the text language / mother tongue pair.
@ -317,25 +318,51 @@ public class JLanguageTool {
* @since 6.6
*/
public JLanguageTool(Language language,
                     List<Language> altLanguages,
                     Language motherTongue,
                     ResultCache cache,
                     GlobalConfig globalConfig,
                     UserConfig userConfig,
                     boolean inputLogging,
                     boolean withLanguageModel) {
  // Same as the nine-argument constructor, with no custom rule list (null) supplied.
  this(language, altLanguages, motherTongue, cache, globalConfig, userConfig,
    inputLogging, withLanguageModel, null);
}
/**
* Create a JLanguageTool and setup the built-in rules for the
* given language and false friend rules for the text language / mother tongue pair.
*
* @param language the language of the text to be checked
* @param altLanguages The languages that are accepted as alternative languages - currently this means
* words are accepted if they are in an alternative language and not similar to
* a word from {@code language}. If there's a similar word in {@code language},
* there will be an error of type {@link RuleMatch.Type#Hint} (EXPERIMENTAL)
* @param motherTongue the user's mother tongue, used for false friend rules, or <code>null</code>.
* The mother tongue may also be used as a source language for checking bilingual texts.
* @param cache a cache to speed up checking if the same sentences get checked more than once,
* e.g. when LT is running as a server and texts are re-checked due to changes
* @param inputLogging allow inclusion of input in logs on exceptions
* @param withLanguageModel will not call updateOptionalLanguageModelRules(null) if this is true
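* @param customRules rules to use for the JLanguageTool instance instead of initializing with the built-in ones, or {@code null} to use the built-in rules; if a list is given, this constructor also does not activate the default pattern rules, the default false friend rules or the optional language model rules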
* @since 6.6
*/
public JLanguageTool(Language language, List<Language> altLanguages, Language motherTongue, ResultCache cache, GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging, boolean withLanguageModel, List<Rule> customRules) {
this.language = Objects.requireNonNull(language, "language cannot be null");
this.altLanguages = Objects.requireNonNull(altLanguages, "altLanguages cannot be null (but empty)");
this.motherTongue = motherTongue;
this.userConfig = Objects.requireNonNullElseGet(userConfig, UserConfig::new);
this.globalConfig = globalConfig;
ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
this.cleanOverlappingMatches = true;
try {
activateDefaultPatternRules();
if (!language.hasNGramFalseFriendRule(motherTongue)) {
// use the old false friends, which always match, not depending on context
activateDefaultFalseFriendRules();
ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
if (customRules != null) {
builtinRules = new ArrayList<>(customRules);
} else {
builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
try {
activateDefaultPatternRules();
if (!language.hasNGramFalseFriendRule(motherTongue)) {
// use the old false friends, which always match, not depending on context
activateDefaultFalseFriendRules();
}
if (!withLanguageModel) {
updateOptionalLanguageModelRules(null); // start out with rules without language model
}
} catch (Exception e) {
throw new RuntimeException("Could not activate rules", e);
}
if (!withLanguageModel) {
updateOptionalLanguageModelRules(null); // start out with rules without language model
}
} catch (Exception e) {
throw new RuntimeException("Could not activate rules", e);
}
this.cache = cache;
descProvider = new ShortDescriptionProvider();
@ -775,6 +802,20 @@ public class JLanguageTool {
ruleSetCache.clear();
}
/**
 * Replaces all rules of this instance with the given ones.
 * Every built-in rule and every previously set user rule is removed, the given rules are
 * registered as the new user rules, and the cached rule set is discarded.
 *
 * @param rules a list of Rule objects to be set as the new user-defined rules; must not be {@code null}
 * @throws NullPointerException if {@code rules} is {@code null}; the existing rules are left untouched in that case
 * @since 6.8
 */
public void setRules(List<Rule> rules) {
  // Validate before mutating anything, so a bad argument cannot leave this instance without rules.
  Objects.requireNonNull(rules, "rules cannot be null");
  // Snapshot first: if the caller hands in a list that is backed by userRules or builtinRules,
  // the clear() calls below would otherwise empty it before it could be copied.
  List<Rule> newRules = new ArrayList<>(rules);
  builtinRules.clear();
  userRules.clear();
  userRules.addAll(newRules);
  ruleSetCache.clear();
}
/**
* Disable the given rule category so the check methods like {@link #check(String)} won't use it.
*
@ -1038,9 +1079,15 @@ public class JLanguageTool {
}
/**
 * Runs the check with the rules that are active for the given level and tone tags.
 * The rule set is looked up via {@code getActiveRulesForLevelAndToneTags} and passed, together
 * with all other arguments unchanged, to {@code checkInternalWithCustomRules}.
 */
protected CheckResults checkInternal(AnnotatedText annotatedText,
                                     ParagraphHandling paraMode,
                                     RuleMatchListener listener,
                                     Mode mode,
                                     Level level,
                                     @NotNull Set<ToneTag> toneTags,
                                     @Nullable Long textSessionID,
                                     List<String> sentences,
                                     List<AnalyzedSentence> analyzedSentences) throws IOException {
  return checkInternalWithCustomRules(
    getActiveRulesForLevelAndToneTags(level, toneTags),
    annotatedText, paraMode, listener, mode, level, toneTags, textSessionID, sentences, analyzedSentences);
}
public CheckResults checkInternalWithCustomRules(RuleSet rules, AnnotatedText annotatedText, ParagraphHandling paraMode, RuleMatchListener listener,
Mode mode, Level level, @NotNull Set<ToneTag> toneTags,
@Nullable Long textSessionID, List<String> sentences, List<AnalyzedSentence> analyzedSentences) throws IOException {
RuleSet rules = getActiveRulesForLevelAndToneTags(level, toneTags);
if (printStream != null) {
printIfVerbose(rules.allRules().size() + " rules activated for language " + language);
}
@ -1304,7 +1351,7 @@ public class JLanguageTool {
if (matches == null) {
continue;
}
if (cache != null && result.isSuccess()) {
if (cache != null && result.isSuccess() && result.adjustOffsets()) {
// store in cache
InputSentence cacheKey = new InputSentence(
sentence, language, motherTongue, disabledRules, disabledRuleCategories,
@ -1318,8 +1365,10 @@ public class JLanguageTool {
// clone matches before adjusting offsets
// match objects could be relevant to multiple (duplicate) sentences at different offsets
List<RuleMatch> adjustedMatches = matches.stream().map(RuleMatch::new).collect(Collectors.toList());
for (RuleMatch match : adjustedMatches) {
adjustOffset(annotatedText, offset, match);
if (result.adjustOffsets()) {
for (RuleMatch match : adjustedMatches) {
adjustOffset(annotatedText, offset, match);
}
}
remoteMatches.addAll(adjustedMatches);
}

View file

@ -210,7 +210,7 @@ public abstract class RemoteRule extends Rule {
filteredMatches.addAll(filteredSentenceMatches);
}
}
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), filteredMatches, sentences);
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), result.adjustOffsets(), filteredMatches, sentences);
}
List<RuleMatch> filteredMatches = new ArrayList<>();
@ -221,7 +221,7 @@ public abstract class RemoteRule extends Rule {
filteredMatches.addAll(filteredSentenceMatches);
}
}
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), filteredMatches, sentences);
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), result.adjustOffsets(), filteredMatches, sentences);
return result;
});
}

View file

@ -29,6 +29,8 @@ import java.util.*;
public class RemoteRuleResult {
private final boolean remote; // was remote needed/involved? rules may filter input sentences and only call remote on some; for metrics
private final boolean success; // successful -> for caching, so that we can cache: remote not needed for this sentence
private final boolean adjustOffsets; // whether rule matches are relative to each sentence and need to be adjusted further
// or already use the positions from the analyzed sentence and don't need to be adjusted
private final List<RuleMatch> matches;
private final Set<AnalyzedSentence> processedSentences;
// which sentences were processed? to distinguish between no matches because not processed (e.g. cached)
@ -36,9 +38,10 @@ public class RemoteRuleResult {
private final Map<AnalyzedSentence, List<RuleMatch>> sentenceMatches = new HashMap<>();
public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
public RemoteRuleResult(boolean remote, boolean success, boolean adjustOffsets, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
this.remote = remote;
this.success = success;
this.adjustOffsets = adjustOffsets;
this.matches = matches;
this.processedSentences = Collections.unmodifiableSet(new HashSet<>(processedSentences));
@ -54,6 +57,10 @@ public class RemoteRuleResult {
}
}
/**
 * Creates a result with {@code adjustOffsets} set to {@code true}; all other arguments are
 * forwarded unchanged to the five-argument constructor.
 */
public RemoteRuleResult(boolean remote,
                        boolean success,
                        List<RuleMatch> matches,
                        List<AnalyzedSentence> processedSentences) {
  this(remote, success, true, matches, processedSentences);
}
/**
 * @return the {@code remote} flag this result was created with
 */
public boolean isRemote() {
  return this.remote;
}
@ -62,6 +69,10 @@ public class RemoteRuleResult {
return success;
}
/**
 * @return the {@code adjustOffsets} flag this result was created with
 */
public boolean adjustOffsets() {
  return this.adjustOffsets;
}
/**
 * @return the stored list of matches (the list held by this result itself, not a copy)
 */
public List<RuleMatch> getMatches() {
  return this.matches;
}

View file

@ -44,9 +44,13 @@ import org.slf4j.MDC;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.Pattern;
@ -262,6 +266,27 @@ abstract class TextChecker {
}
}
/**
 * Derives a deterministic 64-bit signed session id from a string.
 * The value of {@code textSessionIdParam} is hashed if it is non-null, otherwise {@code ip} is used.
 * The id is the first eight bytes of the SHA-256 digest of the UTF-8 encoded input, read as a
 * big-endian long.
 *
 * @param textSessionIdParam the session id sent by the client, or {@code null}
 * @param ip fallback input that is hashed when no session id was sent, or {@code null}
 * @return the derived id, or {@code null} if both arguments are {@code null}
 */
protected static Long computeTextSessionID(String textSessionIdParam, String ip) {
  String source = textSessionIdParam == null ? ip : textSessionIdParam;
  if (source == null) {
    return null;
  }
  MessageDigest sha256;
  try {
    sha256 = MessageDigest.getInstance("SHA-256");
  } catch (NoSuchAlgorithmException e) {
    // Should not happen for SHA-256, wrap in a runtime exception
    throw new RuntimeException("SHA-256 not supported", e);
  }
  byte[] digest = sha256.digest(source.getBytes(StandardCharsets.UTF_8));
  // ByteBuffer defaults to big-endian, so this reads digest[0..7] with digest[0] as the top byte
  return ByteBuffer.wrap(digest).getLong();
}
private void prewarmPipelinePool() {
// setting + number of pipelines
// typical addon settings at the moment (2018-11-05)
@ -433,38 +458,7 @@ abstract class TextChecker {
boolean filterDictionaryMatches = "true".equals(params.getOrDefault("filterDictionaryMatches", "true"));
Long textSessionId = null;
try {
if (params.containsKey("textSessionId")) {
String textSessionIdStr = params.get("textSessionId");
if (textSessionIdStr.startsWith("user:")) {
int sepPos = textSessionIdStr.indexOf(':');
String sessionId = textSessionIdStr.substring(sepPos + 1);
textSessionId = Long.valueOf(sessionId);
} else if (textSessionIdStr.contains(":")) { // transitioning to new format used in chrome addon
// format: "{random number in 0..99999}:{unix time}"
long random, timestamp;
int sepPos = textSessionIdStr.indexOf(':');
random = Long.parseLong(textSessionIdStr.substring(0, sepPos));
timestamp = Long.parseLong(textSessionIdStr.substring(sepPos + 1));
// use random number to choose a slice in possible range of values
// then choose position in slice by timestamp
long maxRandom = 100000;
long randomSegmentSize = (Long.MAX_VALUE - maxRandom) / maxRandom;
long segmentOffset = random * randomSegmentSize;
if (timestamp > randomSegmentSize) {
log.warn(String.format("Could not transform textSessionId '%s'", textSessionIdStr));
}
textSessionId = segmentOffset + timestamp;
} else {
textSessionId = Long.valueOf(textSessionIdStr);
}
}
} catch (NumberFormatException ex) {
log.info("Could not parse textSessionId '" + params.get("textSessionId") + "' as long: " + ex.getMessage() +
", user agent: " + params.get("useragent") + ", version: " + params.get("v") +
", HTTP user agent: " + getHttpUserAgent(httpExchange) + ", referrer: " + getHttpReferrer(httpExchange));
}
Long textSessionId = computeTextSessionID(params.get("textSessionId"), remoteAddress);
List<String> abTest = AB_TEST_SERVICE.getActiveAbTestForClient(params, config);