Merge pull request #11655 from languagetool-org/custom-rules-jlanguagetool

Extend JLanguageTool to make customising the used rules easier
2026-04-21 13:37:25 +00:00 · 2025-11-26 11:35:10 +01:00 · 2025-11-26 11:35:10 +01:00 · 1d0e61a3cc
commit 1d0e61a3cc
parent 0e9972207c 96da4cf0e5
4 changed files with 106 additions and 52 deletions
--- a/languagetool-core/src/main/java/org/languagetool/JLanguageTool.java
+++ b/languagetool-core/src/main/java/org/languagetool/JLanguageTool.java
@ -298,7 +298,8 @@ public class JLanguageTool {
                       GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging) {
    this(language, altLanguages, motherTongue, cache, globalConfig, userConfig, true, false);
  }
-  
+
+
  /**
   * Create a JLanguageTool and setup the built-in rules for the
   * given language and false friend rules for the text language / mother tongue pair.
@ -317,25 +318,51 @@ public class JLanguageTool {
   * @since 6.6
   */
  public JLanguageTool(Language language, List<Language> altLanguages, Language motherTongue, ResultCache cache, GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging, boolean withLanguageModel) {
+      this(language, altLanguages, motherTongue, cache, globalConfig, userConfig, inputLogging, withLanguageModel, null);
+  }
+
+  /**
+   * Create a JLanguageTool and setup the built-in rules for the
+   * given language and false friend rules for the text language / mother tongue pair.
+   *
+   * @param language     the language of the text to be checked
+   * @param altLanguages The languages that are accepted as alternative languages - currently this means
+   *                     words are accepted if they are in an alternative language and not similar to
+   *                     a word from {@code language}. If there's a similar word in {@code language},
+   *                     there will be an error of type {@link RuleMatch.Type#Hint} (EXPERIMENTAL)
+   * @param motherTongue the user's mother tongue, used for false friend rules, or <code>null</code>.
+   *          The mother tongue may also be used as a source language for checking bilingual texts.
+   * @param cache a cache to speed up checking if the same sentences get checked more than once,
+   *              e.g. when LT is running as a server and texts are re-checked due to changes
+   * @param inputLogging allow inclusion of input in logs on exceptions
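+   * @param withLanguageModel if true, the initial {@code updateOptionalLanguageModelRules(null)} call is skipped during built-in rule setup; not consulted for that setup when {@code customRules} is non-null
+   * @param customRules rules to use for the JLanguageTool instance instead of initializing with the built-in ones (the default pattern and false friend rules are then not activated here), or null to use built-in rules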
+   * @since 6.8
+   */
+  public JLanguageTool(Language language, List<Language> altLanguages, Language motherTongue, ResultCache cache, GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging, boolean withLanguageModel, List<Rule> customRules) {
    this.language = Objects.requireNonNull(language, "language cannot be null");
    this.altLanguages = Objects.requireNonNull(altLanguages, "altLanguages cannot be null (but empty)");
    this.motherTongue = motherTongue;
    this.userConfig = Objects.requireNonNullElseGet(userConfig, UserConfig::new);
    this.globalConfig = globalConfig;
-    ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
-    builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
    this.cleanOverlappingMatches = true;
-    try {
-      activateDefaultPatternRules();
-      if (!language.hasNGramFalseFriendRule(motherTongue)) {
-        // use the old false friends, which always match, not depending on context
-        activateDefaultFalseFriendRules();
+    ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
+    if (customRules != null) {
+      builtinRules = new ArrayList<>(customRules);
+    } else {
+      builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
+      try {
+        activateDefaultPatternRules();
+        if (!language.hasNGramFalseFriendRule(motherTongue)) {
+          // use the old false friends, which always match, not depending on context
+          activateDefaultFalseFriendRules();
+        }
+        if (!withLanguageModel) {
+          updateOptionalLanguageModelRules(null); // start out with rules without language model
+        }
+      } catch (Exception e) {
+        throw new RuntimeException("Could not activate rules", e);
      }
-      if (!withLanguageModel) {
-        updateOptionalLanguageModelRules(null); // start out with rules without language model
-      }
-    } catch (Exception e) {
-      throw new RuntimeException("Could not activate rules", e);
    }
    this.cache = cache;
    descProvider = new ShortDescriptionProvider();
@ -775,6 +802,20 @@ public class JLanguageTool {
    ruleSetCache.clear();
  }

+  /**
+   * Replaces all rules of this instance with the provided rules, which are registered as user-defined rules.
+   * Clears any existing user and built-in rules, as well as the cached rule set, before applying the new rules.
+   *
+   * @param rules a list of Rule objects to be set as the new user-defined rules, must not be null
+   * @since 6.8
+   */
+  public void setRules(List<Rule> rules) {
+    builtinRules.clear();
+    userRules.clear();
+    userRules.addAll(rules);
+    ruleSetCache.clear();
+  }
+
  /**
   * Disable the given rule category so the check methods like {@link #check(String)} won't use it.
   *
@ -1038,9 +1079,15 @@ public class JLanguageTool {
  }

  protected CheckResults checkInternal(AnnotatedText annotatedText, ParagraphHandling paraMode, RuleMatchListener listener,
+                                       Mode mode, Level level, @NotNull Set<ToneTag> toneTags,
+                                       @Nullable Long textSessionID, List<String> sentences, List<AnalyzedSentence> analyzedSentences) throws IOException {
+    RuleSet rules = getActiveRulesForLevelAndToneTags(level, toneTags);
+    return checkInternalWithCustomRules(rules, annotatedText, paraMode, listener, mode, level, toneTags, textSessionID, sentences, analyzedSentences);
+  }
+
+  public CheckResults checkInternalWithCustomRules(RuleSet rules, AnnotatedText annotatedText, ParagraphHandling paraMode, RuleMatchListener listener,
                                     Mode mode, Level level, @NotNull Set<ToneTag> toneTags,
                                     @Nullable Long textSessionID, List<String> sentences, List<AnalyzedSentence> analyzedSentences) throws IOException {
-    RuleSet rules = getActiveRulesForLevelAndToneTags(level, toneTags);
    if (printStream != null) {
      printIfVerbose(rules.allRules().size() + " rules activated for language " + language);
    }
@ -1304,7 +1351,7 @@ public class JLanguageTool {
      if (matches == null) {
        continue;
      }
-      if (cache != null && result.isSuccess()) {
+      if (cache != null && result.isSuccess() && result.adjustOffsets()) {
        // store in cache
        InputSentence cacheKey = new InputSentence(
          sentence, language, motherTongue, disabledRules, disabledRuleCategories,
@ -1318,8 +1365,10 @@ public class JLanguageTool {
      // clone matches before adjusting offsets
      // match objects could be relevant to multiple (duplicate) sentences at different offsets
      List<RuleMatch> adjustedMatches = matches.stream().map(RuleMatch::new).collect(Collectors.toList());
-      for (RuleMatch match : adjustedMatches) {
-        adjustOffset(annotatedText, offset, match);
+      if (result.adjustOffsets()) {
+        for (RuleMatch match : adjustedMatches) {
+          adjustOffset(annotatedText, offset, match);
+        }
      }
      remoteMatches.addAll(adjustedMatches);
    }
--- a/languagetool-core/src/main/java/org/languagetool/rules/RemoteRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/RemoteRule.java
@ -210,7 +210,7 @@ public abstract class RemoteRule extends Rule {
            filteredMatches.addAll(filteredSentenceMatches);
          }
        }
-        result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), filteredMatches, sentences);
+        result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), result.adjustOffsets(), filteredMatches, sentences);
      }

      List<RuleMatch> filteredMatches = new ArrayList<>();
@ -221,7 +221,7 @@ public abstract class RemoteRule extends Rule {
          filteredMatches.addAll(filteredSentenceMatches);
        }
      }
-      result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), filteredMatches, sentences);
+      result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), result.adjustOffsets(), filteredMatches, sentences);
      return result;
    });
  }
--- a/languagetool-core/src/main/java/org/languagetool/rules/RemoteRuleResult.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/RemoteRuleResult.java
@ -29,6 +29,8 @@ import java.util.*;
 public class RemoteRuleResult {
  private final boolean remote; // was remote needed/involved? rules may filter input sentences and only call remote on some; for metrics
  private final boolean success; // successful -> for caching, so that we can cache: remote not needed for this sentence
+  private final boolean adjustOffsets; // whether rule matches are relative to each sentence and need to be adjusted further
+  // or already use the positions from the analyzed sentence and don't need to be adjusted
  private final List<RuleMatch> matches;
  private final Set<AnalyzedSentence> processedSentences;
  // which sentences were processed? to distinguish between no matches because not processed (e.g. cached)
@ -36,9 +38,10 @@ public class RemoteRuleResult {

  private final Map<AnalyzedSentence, List<RuleMatch>> sentenceMatches = new HashMap<>();

-  public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
+  public RemoteRuleResult(boolean remote, boolean success, boolean adjustOffsets, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
    this.remote = remote;
    this.success = success;
+    this.adjustOffsets = adjustOffsets;
    this.matches = matches;
    this.processedSentences = Collections.unmodifiableSet(new HashSet<>(processedSentences));

@ -54,6 +57,10 @@ public class RemoteRuleResult {
    }
  }

+  public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
+    this(remote, success, true, matches, processedSentences);
+  }
+
  public boolean isRemote() {
    return remote;
  }
@ -62,6 +69,10 @@ public class RemoteRuleResult {
    return success;
  }

+  public boolean adjustOffsets() {
+    return adjustOffsets;
+  }
+
  public List<RuleMatch> getMatches() {
    return matches;
  }
--- a/languagetool-server/src/main/java/org/languagetool/server/TextChecker.java
+++ b/languagetool-server/src/main/java/org/languagetool/server/TextChecker.java
@ -44,9 +44,13 @@ import org.slf4j.MDC;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.net.HttpURLConnection;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 import java.util.*;
 import java.util.concurrent.*;
 import java.util.regex.Pattern;
@ -262,6 +266,27 @@ abstract class TextChecker {
    }
  }

+  /**
+   * Hash a string deterministically into a 64-bit signed long (the first 8 bytes of its SHA-256 digest); use textSessionIdParam if non-null, fall back to client IP; returns null if both are null.
+   */
+  protected static Long computeTextSessionID(String textSessionIdParam, String ip) {
+      String input = textSessionIdParam != null ? textSessionIdParam : ip;
+      if (input == null) {
+        return null;
+      }
+      try {
+        MessageDigest md = MessageDigest.getInstance("SHA-256");
+        byte[] bytes = md.digest(input.getBytes(StandardCharsets.UTF_8));
+
+        ByteBuffer buffer = ByteBuffer.wrap(bytes);
+        Long textSessionId = buffer.getLong();
+        return textSessionId;
+      } catch (NoSuchAlgorithmException e) {
+        // Should not happen for SHA-256, wrap in a runtime exception
+        throw new RuntimeException("SHA-256 not supported", e);
+      }
+  }
+
  private void prewarmPipelinePool() {
    // setting + number of pipelines
    // typical addon settings at the moment (2018-11-05)
@ -433,38 +458,7 @@ abstract class TextChecker {

    boolean filterDictionaryMatches = "true".equals(params.getOrDefault("filterDictionaryMatches", "true"));

-    Long textSessionId = null;
-    try {
-      if (params.containsKey("textSessionId")) {
-        String textSessionIdStr = params.get("textSessionId");
-        if (textSessionIdStr.startsWith("user:")) {
-          int sepPos = textSessionIdStr.indexOf(':');
-          String sessionId = textSessionIdStr.substring(sepPos + 1);
-          textSessionId = Long.valueOf(sessionId);
-        } else if (textSessionIdStr.contains(":")) { // transitioning to new format used in chrome addon
-          // format: "{random number in 0..99999}:{unix time}"
-          long random, timestamp;
-          int sepPos = textSessionIdStr.indexOf(':');
-          random = Long.parseLong(textSessionIdStr.substring(0, sepPos));
-          timestamp = Long.parseLong(textSessionIdStr.substring(sepPos + 1));
-          // use random number to choose a slice in possible range of values
-          // then choose position in slice by timestamp
-          long maxRandom = 100000;
-          long randomSegmentSize = (Long.MAX_VALUE - maxRandom) / maxRandom;
-          long segmentOffset = random * randomSegmentSize;
-          if (timestamp > randomSegmentSize) {
-            log.warn(String.format("Could not transform textSessionId '%s'", textSessionIdStr));
-          }
-          textSessionId = segmentOffset + timestamp;
-        } else {
-          textSessionId = Long.valueOf(textSessionIdStr);
-        }
-      }
-    } catch (NumberFormatException ex) {
-      log.info("Could not parse textSessionId '" + params.get("textSessionId") + "' as long: " + ex.getMessage() +
-        ", user agent: " + params.get("useragent") + ", version: " + params.get("v") +
-        ", HTTP user agent: " + getHttpUserAgent(httpExchange) + ", referrer: " + getHttpReferrer(httpExchange));
-    }
+    Long textSessionId = computeTextSessionID(params.get("textSessionId"), remoteAddress);

    List<String> abTest = AB_TEST_SERVICE.getActiveAbTestForClient(params, config);