mirror of
https://github.com/languagetool-org/languagetool
synced 2026-04-21 13:37:25 +00:00
Merge pull request #11655 from languagetool-org/custom-rules-jlanguagetool
extend JLanguageTool to allow customising used rules easier
This commit is contained in:
commit
1d0e61a3cc
4 changed files with 106 additions and 52 deletions
|
|
@ -298,7 +298,8 @@ public class JLanguageTool {
|
|||
GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging) {
|
||||
this(language, altLanguages, motherTongue, cache, globalConfig, userConfig, true, false);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Create a JLanguageTool and setup the built-in rules for the
|
||||
* given language and false friend rules for the text language / mother tongue pair.
|
||||
|
|
@ -317,25 +318,51 @@ public class JLanguageTool {
|
|||
* @since 6.6
|
||||
*/
|
||||
public JLanguageTool(Language language, List<Language> altLanguages, Language motherTongue, ResultCache cache, GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging, boolean withLanguageModel) {
|
||||
// Delegate to the constructor that also accepts custom rules; passing null for
// customRules selects the built-in rules of the language.
this(language, altLanguages, motherTongue, cache, globalConfig, userConfig, inputLogging, withLanguageModel, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a JLanguageTool and setup the built-in rules for the
|
||||
* given language and false friend rules for the text language / mother tongue pair.
|
||||
*
|
||||
* @param language the language of the text to be checked
|
||||
* @param altLanguages The languages that are accepted as alternative languages - currently this means
|
||||
* words are accepted if they are in an alternative language and not similar to
|
||||
* a word from {@code language}. If there's a similar word in {@code language},
|
||||
* there will be an error of type {@link RuleMatch.Type#Hint} (EXPERIMENTAL)
|
||||
* @param motherTongue the user's mother tongue, used for false friend rules, or <code>null</code>.
|
||||
* The mother tongue may also be used as a source language for checking bilingual texts.
|
||||
* @param cache a cache to speed up checking if the same sentences get checked more than once,
|
||||
* e.g. when LT is running as a server and texts are re-checked due to changes
|
||||
* @param inputLogging allow inclusion of input in logs on exceptions
|
||||
* @param withLanguageModel will not call updateOptionalLanguageModelRules(null) if this is true
|
||||
* @param customRules rules to use for the JLanguageTool instance instead of initializing with the built-in ones, or null to use built-in rules
|
||||
* @since 6.6
|
||||
*/
|
||||
public JLanguageTool(Language language, List<Language> altLanguages, Language motherTongue, ResultCache cache, GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging, boolean withLanguageModel, List<Rule> customRules) {
|
||||
this.language = Objects.requireNonNull(language, "language cannot be null");
|
||||
this.altLanguages = Objects.requireNonNull(altLanguages, "altLanguages cannot be null (but empty)");
|
||||
this.motherTongue = motherTongue;
|
||||
this.userConfig = Objects.requireNonNullElseGet(userConfig, UserConfig::new);
|
||||
this.globalConfig = globalConfig;
|
||||
ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
|
||||
builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
|
||||
this.cleanOverlappingMatches = true;
|
||||
try {
|
||||
activateDefaultPatternRules();
|
||||
if (!language.hasNGramFalseFriendRule(motherTongue)) {
|
||||
// use the old false friends, which always match, not depending on context
|
||||
activateDefaultFalseFriendRules();
|
||||
ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
|
||||
if (customRules != null) {
|
||||
builtinRules = new ArrayList<>(customRules);
|
||||
} else {
|
||||
builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
|
||||
try {
|
||||
activateDefaultPatternRules();
|
||||
if (!language.hasNGramFalseFriendRule(motherTongue)) {
|
||||
// use the old false friends, which always match, not depending on context
|
||||
activateDefaultFalseFriendRules();
|
||||
}
|
||||
if (!withLanguageModel) {
|
||||
updateOptionalLanguageModelRules(null); // start out with rules without language model
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Could not activate rules", e);
|
||||
}
|
||||
if (!withLanguageModel) {
|
||||
updateOptionalLanguageModelRules(null); // start out with rules without language model
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Could not activate rules", e);
|
||||
}
|
||||
this.cache = cache;
|
||||
descProvider = new ShortDescriptionProvider();
|
||||
|
|
@ -775,6 +802,20 @@ public class JLanguageTool {
|
|||
ruleSetCache.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the rules for the system by replacing the user-defined rules with the provided set of rules.
|
||||
* Clears any existing user and built-in rules, as well as the cached rule set, before applying the new rules.
|
||||
*
|
||||
* @param rules a list of Rule objects to be set as the new user-defined rules
|
||||
* @since 6.8
|
||||
*/
|
||||
public void setRules(List<Rule> rules) {
|
||||
builtinRules.clear();
|
||||
userRules.clear();
|
||||
userRules.addAll(rules);
|
||||
ruleSetCache.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* Disable the given rule category so the check methods like {@link #check(String)} won't use it.
|
||||
*
|
||||
|
|
@ -1038,9 +1079,15 @@ public class JLanguageTool {
|
|||
}
|
||||
|
||||
protected CheckResults checkInternal(AnnotatedText annotatedText, ParagraphHandling paraMode, RuleMatchListener listener,
|
||||
Mode mode, Level level, @NotNull Set<ToneTag> toneTags,
|
||||
@Nullable Long textSessionID, List<String> sentences, List<AnalyzedSentence> analyzedSentences) throws IOException {
|
||||
RuleSet rules = getActiveRulesForLevelAndToneTags(level, toneTags);
|
||||
return checkInternalWithCustomRules(rules, annotatedText, paraMode, listener, mode, level, toneTags, textSessionID, sentences, analyzedSentences);
|
||||
}
|
||||
|
||||
public CheckResults checkInternalWithCustomRules(RuleSet rules, AnnotatedText annotatedText, ParagraphHandling paraMode, RuleMatchListener listener,
|
||||
Mode mode, Level level, @NotNull Set<ToneTag> toneTags,
|
||||
@Nullable Long textSessionID, List<String> sentences, List<AnalyzedSentence> analyzedSentences) throws IOException {
|
||||
RuleSet rules = getActiveRulesForLevelAndToneTags(level, toneTags);
|
||||
if (printStream != null) {
|
||||
printIfVerbose(rules.allRules().size() + " rules activated for language " + language);
|
||||
}
|
||||
|
|
@ -1304,7 +1351,7 @@ public class JLanguageTool {
|
|||
if (matches == null) {
|
||||
continue;
|
||||
}
|
||||
if (cache != null && result.isSuccess()) {
|
||||
if (cache != null && result.isSuccess() && result.adjustOffsets()) {
|
||||
// store in cache
|
||||
InputSentence cacheKey = new InputSentence(
|
||||
sentence, language, motherTongue, disabledRules, disabledRuleCategories,
|
||||
|
|
@ -1318,8 +1365,10 @@ public class JLanguageTool {
|
|||
// clone matches before adjusting offsets
|
||||
// match objects could be relevant to multiple (duplicate) sentences at different offsets
|
||||
List<RuleMatch> adjustedMatches = matches.stream().map(RuleMatch::new).collect(Collectors.toList());
|
||||
for (RuleMatch match : adjustedMatches) {
|
||||
adjustOffset(annotatedText, offset, match);
|
||||
if (result.adjustOffsets()) {
|
||||
for (RuleMatch match : adjustedMatches) {
|
||||
adjustOffset(annotatedText, offset, match);
|
||||
}
|
||||
}
|
||||
remoteMatches.addAll(adjustedMatches);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -210,7 +210,7 @@ public abstract class RemoteRule extends Rule {
|
|||
filteredMatches.addAll(filteredSentenceMatches);
|
||||
}
|
||||
}
|
||||
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), filteredMatches, sentences);
|
||||
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), result.adjustOffsets(), filteredMatches, sentences);
|
||||
}
|
||||
|
||||
List<RuleMatch> filteredMatches = new ArrayList<>();
|
||||
|
|
@ -221,7 +221,7 @@ public abstract class RemoteRule extends Rule {
|
|||
filteredMatches.addAll(filteredSentenceMatches);
|
||||
}
|
||||
}
|
||||
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), filteredMatches, sentences);
|
||||
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), result.adjustOffsets(), filteredMatches, sentences);
|
||||
return result;
|
||||
});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,6 +29,8 @@ import java.util.*;
|
|||
public class RemoteRuleResult {
|
||||
private final boolean remote; // was remote needed/involved? rules may filter input sentences and only call remote on some; for metrics
|
||||
private final boolean success; // successful -> for caching, so that we can cache: remote not needed for this sentence
|
||||
private final boolean adjustOffsets; // whether rule matches are relative to each sentence and need to be adjusted further
|
||||
// or already use the positions from the analyzed sentence and don't need to be adjusted
|
||||
private final List<RuleMatch> matches;
|
||||
private final Set<AnalyzedSentence> processedSentences;
|
||||
// which sentences were processed? to distinguish between no matches because not processed (e.g. cached)
|
||||
|
|
@ -36,9 +38,10 @@ public class RemoteRuleResult {
|
|||
|
||||
private final Map<AnalyzedSentence, List<RuleMatch>> sentenceMatches = new HashMap<>();
|
||||
|
||||
public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
|
||||
public RemoteRuleResult(boolean remote, boolean success, boolean adjustOffsets, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
|
||||
this.remote = remote;
|
||||
this.success = success;
|
||||
this.adjustOffsets = adjustOffsets;
|
||||
this.matches = matches;
|
||||
this.processedSentences = Collections.unmodifiableSet(new HashSet<>(processedSentences));
|
||||
|
||||
|
|
@ -54,6 +57,10 @@ public class RemoteRuleResult {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Convenience overload without the {@code adjustOffsets} argument: delegates to the
 * five-argument constructor with {@code adjustOffsets} set to {@code true}.
 */
public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
|
||||
this(remote, success, true, matches, processedSentences);
|
||||
}
|
||||
|
||||
/**
 * @return whether remote was needed/involved for this result; rules may filter input
 *         sentences and only call remote on some of them (used for metrics)
 */
public boolean isRemote() {
|
||||
return remote;
|
||||
}
|
||||
|
|
@ -62,6 +69,10 @@ public class RemoteRuleResult {
|
|||
return success;
|
||||
}
|
||||
|
||||
/**
 * @return {@code true} if the rule matches are relative to each sentence and their offsets
 *         still need to be adjusted; {@code false} if they already use the positions from
 *         the analyzed sentence and need no further adjustment
 */
public boolean adjustOffsets() {
|
||||
return adjustOffsets;
|
||||
}
|
||||
|
||||
/**
 * @return the list of matches stored in this result; the internal list reference is
 *         returned as is, not a copy
 */
public List<RuleMatch> getMatches() {
|
||||
return matches;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -44,9 +44,13 @@ import org.slf4j.MDC;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
|
@ -262,6 +266,27 @@ abstract class TextChecker {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Hash a string deterministically into a 64-bit signed long; use textSessionIdParam if set, fall back to client IP.
|
||||
*/
|
||||
protected static Long computeTextSessionID(String textSessionIdParam, String ip) {
|
||||
String input = textSessionIdParam != null ? textSessionIdParam : ip;
|
||||
if (input == null) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
MessageDigest md = MessageDigest.getInstance("SHA-256");
|
||||
byte[] bytes = md.digest(input.getBytes(StandardCharsets.UTF_8));
|
||||
|
||||
ByteBuffer buffer = ByteBuffer.wrap(bytes);
|
||||
Long textSessionId = buffer.getLong();
|
||||
return textSessionId;
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
// Should not happen for SHA-256, wrap in a runtime exception
|
||||
throw new RuntimeException("SHA-256 not supported", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void prewarmPipelinePool() {
|
||||
// setting + number of pipelines
|
||||
// typical addon settings at the moment (2018-11-05)
|
||||
|
|
@ -433,38 +458,7 @@ abstract class TextChecker {
|
|||
|
||||
boolean filterDictionaryMatches = "true".equals(params.getOrDefault("filterDictionaryMatches", "true"));
|
||||
|
||||
Long textSessionId = null;
|
||||
try {
|
||||
if (params.containsKey("textSessionId")) {
|
||||
String textSessionIdStr = params.get("textSessionId");
|
||||
if (textSessionIdStr.startsWith("user:")) {
|
||||
int sepPos = textSessionIdStr.indexOf(':');
|
||||
String sessionId = textSessionIdStr.substring(sepPos + 1);
|
||||
textSessionId = Long.valueOf(sessionId);
|
||||
} else if (textSessionIdStr.contains(":")) { // transitioning to new format used in chrome addon
|
||||
// format: "{random number in 0..99999}:{unix time}"
|
||||
long random, timestamp;
|
||||
int sepPos = textSessionIdStr.indexOf(':');
|
||||
random = Long.parseLong(textSessionIdStr.substring(0, sepPos));
|
||||
timestamp = Long.parseLong(textSessionIdStr.substring(sepPos + 1));
|
||||
// use random number to choose a slice in possible range of values
|
||||
// then choose position in slice by timestamp
|
||||
long maxRandom = 100000;
|
||||
long randomSegmentSize = (Long.MAX_VALUE - maxRandom) / maxRandom;
|
||||
long segmentOffset = random * randomSegmentSize;
|
||||
if (timestamp > randomSegmentSize) {
|
||||
log.warn(String.format("Could not transform textSessionId '%s'", textSessionIdStr));
|
||||
}
|
||||
textSessionId = segmentOffset + timestamp;
|
||||
} else {
|
||||
textSessionId = Long.valueOf(textSessionIdStr);
|
||||
}
|
||||
}
|
||||
} catch (NumberFormatException ex) {
|
||||
log.info("Could not parse textSessionId '" + params.get("textSessionId") + "' as long: " + ex.getMessage() +
|
||||
", user agent: " + params.get("useragent") + ", version: " + params.get("v") +
|
||||
", HTTP user agent: " + getHttpUserAgent(httpExchange) + ", referrer: " + getHttpReferrer(httpExchange));
|
||||
}
|
||||
Long textSessionId = computeTextSessionID(params.get("textSessionId"), remoteAddress);
|
||||
|
||||
List<String> abTest = AB_TEST_SERVICE.getActiveAbTestForClient(params, config);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue