mirror of
https://github.com/languagetool-org/languagetool
synced 2026-04-21 13:37:25 +00:00
Add extended sentence range (#9510)
* add new class and fields * refactor LanguageIdentifier * set ExtendedSentenceRange in lt * add ExtendedSentenceRange to CheckResults * implement ExtendedSentenceRange and add them to the response json; Update tests (disable them) TODO: implement tests * start implement tests for MultiLanguage * Update Tests * Update Tests * remove old tests * format files and cleanup * Update PR
This commit is contained in:
parent
c6fd82013d
commit
cbdef0c65c
20 changed files with 815 additions and 472 deletions
|
|
@ -18,46 +18,48 @@
|
|||
*/
|
||||
package org.languagetool;
|
||||
|
||||
import lombok.Getter;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.languagetool.rules.RuleMatch;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* @since 5.3
|
||||
*/
|
||||
public class CheckResults {
|
||||
|
||||
@Getter
|
||||
private List<RuleMatch> ruleMatches;
|
||||
private List<Range> ignoredRanges;
|
||||
@Getter
|
||||
private final List<Range> ignoredRanges;
|
||||
@Getter
|
||||
private final List<ExtendedSentenceRange> extendedSentenceRanges;
|
||||
private final List<SentenceRange> sentenceRanges = new ArrayList<>();
|
||||
|
||||
|
||||
public CheckResults(List<RuleMatch> ruleMatches, List<Range> ignoredRanges) {
|
||||
this(ruleMatches, ignoredRanges, Collections.emptyList());
|
||||
}
|
||||
|
||||
public CheckResults(List<RuleMatch> ruleMatches, List<Range> ignoredRanges, List<ExtendedSentenceRange> extendedSentenceRanges) {
|
||||
this.ruleMatches = Objects.requireNonNull(ruleMatches);
|
||||
this.ignoredRanges = Objects.requireNonNull(ignoredRanges);
|
||||
}
|
||||
|
||||
public List<Range> getIgnoredRanges() {
|
||||
return ignoredRanges;
|
||||
}
|
||||
|
||||
public List<RuleMatch> getRuleMatches() {
|
||||
return ruleMatches;
|
||||
this.extendedSentenceRanges = Objects.requireNonNull(extendedSentenceRanges.stream().sorted().collect(Collectors.toList()));
|
||||
//TODO: use this later, when we are sure the sentenceRanges (from extendedSentenceRange) are are correct.
|
||||
// Right now the sentenceRanges are calculated different from those in extendedSentenceRange.
|
||||
// extendedSentenceRanges.forEach(extendedSentenceRange -> this.sentenceRanges.add(new SentenceRange(extendedSentenceRange.getFromPos(), extendedSentenceRange.getToPos())));
|
||||
}
|
||||
|
||||
@NotNull
|
||||
public List<SentenceRange> getSentenceRanges() {
|
||||
return sentenceRanges;
|
||||
return Collections.unmodifiableList(this.sentenceRanges);
|
||||
}
|
||||
|
||||
public void addSentenceRanges(List<SentenceRange> sentenceRanges) {
|
||||
this.sentenceRanges.addAll(sentenceRanges);
|
||||
}
|
||||
|
||||
public void setIgnoredRanges(List<Range> ignoredRanges) {
|
||||
this.ignoredRanges = Objects.requireNonNull(ignoredRanges);
|
||||
}
|
||||
|
||||
public void setRuleMatches(List<RuleMatch> ruleMatches) {
|
||||
this.ruleMatches = Objects.requireNonNull(ruleMatches);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* LanguageTool, a natural language style checker
|
||||
* Copyright (c) 2023. Stefan Viol (https://stevio.de)
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
||||
* USA
|
||||
*/
|
||||
|
||||
package org.languagetool;
|
||||
|
||||
import lombok.Getter;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public final class ExtendedSentenceRange implements Comparable<ExtendedSentenceRange> {
|
||||
|
||||
@Getter
|
||||
private final int fromPos;
|
||||
@Getter
|
||||
private final int toPos;
|
||||
@Getter
|
||||
private final Map<String, Float> languageConfidenceRates; //languageCode;0-1 confidenceRate from LanguageDetectionService
|
||||
|
||||
ExtendedSentenceRange(int fromPos, int toPos, String languageCode) {
|
||||
this(fromPos, toPos, Collections.singletonMap(languageCode, 1.0f));
|
||||
}
|
||||
|
||||
ExtendedSentenceRange(int fromPos, int toPos, @NotNull Map<String, Float> languageConfidenceRates) {
|
||||
this.fromPos = fromPos;
|
||||
this.toPos = toPos;
|
||||
this.languageConfidenceRates = new LinkedHashMap<>(languageConfidenceRates);
|
||||
}
|
||||
|
||||
public void updateLanguageConfidenceRates(@NotNull Map<String, Float> languageConfidenceRates) {
|
||||
this.languageConfidenceRates.clear();
|
||||
this.languageConfidenceRates.putAll(languageConfidenceRates);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
ExtendedSentenceRange extendedSentenceRange = (ExtendedSentenceRange) o;
|
||||
return fromPos == extendedSentenceRange.fromPos && toPos == extendedSentenceRange.toPos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = fromPos;
|
||||
result = 31 * result + toPos;
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return fromPos + "-" + toPos + ":" + languageConfidenceRates;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull ExtendedSentenceRange o) {
|
||||
return Integer.compare(this.fromPos, o.fromPos);
|
||||
}
|
||||
}
|
||||
|
|
@ -927,7 +927,8 @@ public class JLanguageTool {
|
|||
List<String> sentences = getSentences(annotatedText, tokenizeText);
|
||||
List<AnalyzedSentence> analyzedSentences = analyzeSentences(sentences);
|
||||
CheckResults checkResults = checkInternal(annotatedText, paraMode, listener, mode, level, toneTags, textSessionID, sentences, analyzedSentences);
|
||||
checkResults.addSentenceRanges(SentenceRange.getRangesFromSentences(annotatedText, sentences));
|
||||
List<SentenceRange> sentenceRanges = SentenceRange.getRangesFromSentences(annotatedText, sentences);
|
||||
checkResults.addSentenceRanges(sentenceRanges);
|
||||
return checkResults;
|
||||
}
|
||||
|
||||
|
|
@ -1055,8 +1056,7 @@ public class JLanguageTool {
|
|||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return new CheckResults(ruleMatches, res.getIgnoredRanges());
|
||||
return new CheckResults(ruleMatches, res.getIgnoredRanges(), res.getExtendedSentenceRanges());
|
||||
}
|
||||
|
||||
private List<RuleMatch> filterMatches(AnnotatedText annotatedText, RuleSet rules, List<RuleMatch> ruleMatches) {
|
||||
|
|
@ -1936,15 +1936,18 @@ public class JLanguageTool {
|
|||
public CheckResults call() throws Exception {
|
||||
List<RuleMatch> ruleMatches = new ArrayList<>();
|
||||
List<Range> ignoreRanges = new ArrayList<>();
|
||||
List<ExtendedSentenceRange> extendedSentenceRanges = new ArrayList<>();
|
||||
if (mode == Mode.ALL) {
|
||||
ruleMatches.addAll(getTextLevelRuleMatches());
|
||||
CheckResults otherRuleMatches = getOtherRuleMatches(toneTags);
|
||||
ruleMatches.addAll(otherRuleMatches.getRuleMatches());
|
||||
ignoreRanges.addAll(otherRuleMatches.getIgnoredRanges());
|
||||
extendedSentenceRanges.addAll(otherRuleMatches.getExtendedSentenceRanges());
|
||||
} else if (mode == Mode.ALL_BUT_TEXTLEVEL_ONLY) {
|
||||
CheckResults otherRuleMatches = getOtherRuleMatches(toneTags);
|
||||
ruleMatches.addAll(otherRuleMatches.getRuleMatches());
|
||||
ignoreRanges.addAll(otherRuleMatches.getIgnoredRanges());
|
||||
extendedSentenceRanges.addAll(otherRuleMatches.getExtendedSentenceRanges());
|
||||
} else if (mode == Mode.TEXTLEVEL_ONLY) {
|
||||
ruleMatches.addAll(getTextLevelRuleMatches());
|
||||
} else {
|
||||
|
|
@ -1952,7 +1955,7 @@ public class JLanguageTool {
|
|||
}
|
||||
// can't call applyCustomRuleFilters here, done in performCheck ->
|
||||
// should run just once w/ complete list of matches
|
||||
return new CheckResults(ruleMatches, ignoreRanges);
|
||||
return new CheckResults(ruleMatches, ignoreRanges, extendedSentenceRanges);
|
||||
}
|
||||
|
||||
private List<RuleMatch> getTextLevelRuleMatches() throws IOException {
|
||||
|
|
@ -2001,7 +2004,9 @@ public class JLanguageTool {
|
|||
|
||||
private CheckResults getOtherRuleMatches(Set<ToneTag> toneTags) {
|
||||
List<RuleMatch> ruleMatches = new ArrayList<>();
|
||||
List<Range> ignoreRanges = new ArrayList<>();
|
||||
List<Range> ignoreRanges = new ArrayList<>(); //TODO: remove later
|
||||
List<ExtendedSentenceRange> extendedSentenceRanges = new ArrayList<>();
|
||||
|
||||
int textWordCounter = sentences.stream().map(sentenceData -> sentenceData.wordCount).reduce(0, Integer::sum);
|
||||
int wordCounter = 0;
|
||||
float tmpErrorsPerWord = 0.0f;
|
||||
|
|
@ -2009,6 +2014,8 @@ public class JLanguageTool {
|
|||
for (int i = 0, sentencesSize = sentences.size(); i < sentencesSize; i++) {
|
||||
SentenceData sentence = sentences.get(i);
|
||||
wordCounter += sentence.wordCount;
|
||||
ExtendedSentenceRange extendedSentenceRange = new ExtendedSentenceRange(sentence.startOffset, sentence.startOffset + sentence.text.trim().length(), language.getShortCode());
|
||||
extendedSentenceRanges.add(extendedSentenceRange);
|
||||
try {
|
||||
//comment in to trigger an exception via input text:
|
||||
//if (analyzedSentence.getText().contains("fakecrash")) {
|
||||
|
|
@ -2023,7 +2030,6 @@ public class JLanguageTool {
|
|||
sentenceMatches = cache.getIfPresent(cacheKey);
|
||||
}
|
||||
if (sentenceMatches == null) {
|
||||
|
||||
List<Rule> rules = new ArrayList<>(this.rules.rulesForSentence(sentence.analyzed));
|
||||
rules.addAll(userConfig.getRules());
|
||||
sentenceMatches = checkAnalyzedSentence(paraMode, rules, sentence.analyzed, checkRemoteRules, textWordCounter);
|
||||
|
|
@ -2037,11 +2043,13 @@ public class JLanguageTool {
|
|||
}
|
||||
for (RuleMatch elem : sentenceMatches) {
|
||||
RuleMatch thisMatch = adjustRuleMatchPos(elem, sentence.startOffset, sentence.startColumn, sentence.startLine, sentence.text, annotatedText);
|
||||
if (elem.getErrorLimitLang() != null) {
|
||||
Range ignoreRange = new Range(sentence.startOffset, sentence.startOffset + sentence.text.length(), elem.getErrorLimitLang());
|
||||
if (!elem.getNewLanguageMatches().isEmpty()) {
|
||||
//TODO: remove after the addon is updated
|
||||
Range ignoreRange = new Range(sentence.startOffset, sentence.startOffset + sentence.text.length(), elem.getNewLanguageMatches().entrySet().iterator().next().getKey());
|
||||
if (!ignoreRanges.contains(ignoreRange)) {
|
||||
ignoreRanges.add(ignoreRange);
|
||||
}
|
||||
extendedSentenceRange.updateLanguageConfidenceRates(elem.getNewLanguageMatches());
|
||||
}
|
||||
ruleMatches.add(thisMatch);
|
||||
if (listener != null) {
|
||||
|
|
@ -2072,7 +2080,7 @@ public class JLanguageTool {
|
|||
+ StringUtils.abbreviate(sentence.analyzed.toTextString(), 500) + "</sentcontent>", e);
|
||||
}
|
||||
}
|
||||
return new CheckResults(ruleMatches, ignoreRanges);
|
||||
return new CheckResults(ruleMatches, ignoreRanges, extendedSentenceRanges);
|
||||
}
|
||||
|
||||
private LineColumnPosition findLineColumn(int offset) {
|
||||
|
|
|
|||
|
|
@ -18,8 +18,6 @@
|
|||
*/
|
||||
package org.languagetool;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@
|
|||
*/
|
||||
package org.languagetool;
|
||||
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.languagetool.markup.AnnotatedText;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -28,7 +29,7 @@ import java.util.Objects;
|
|||
* A range in a text that makes up a sentence.
|
||||
* @since 5.8
|
||||
*/
|
||||
public class SentenceRange {
|
||||
public class SentenceRange implements Comparable<SentenceRange>{
|
||||
|
||||
private final int fromPos;
|
||||
private final int toPos;
|
||||
|
|
@ -94,4 +95,9 @@ public class SentenceRange {
|
|||
public int hashCode() {
|
||||
return Objects.hash(fromPos, toPos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull SentenceRange o) {
|
||||
return Integer.compare(this.fromPos, o.fromPos);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ import com.optimaize.langdetect.profiles.LanguageProfileReader;
|
|||
import com.optimaize.langdetect.text.RemoveMinorityScriptsTextFilter;
|
||||
import com.optimaize.langdetect.text.TextObjectFactory;
|
||||
import com.optimaize.langdetect.text.TextObjectFactoryBuilder;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.jetbrains.annotations.TestOnly;
|
||||
import org.languagetool.DetectedLanguage;
|
||||
|
|
@ -145,7 +146,7 @@ public class DefaultLanguageIdentifier extends LanguageIdentifier {
|
|||
public AtomicInteger getFasttextInitCounter() {
|
||||
return fasttextInitCounter;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @since 5.2
|
||||
*/
|
||||
|
|
@ -228,20 +229,26 @@ public class DefaultLanguageIdentifier extends LanguageIdentifier {
|
|||
@Nullable
|
||||
@Override
|
||||
public DetectedLanguage detectLanguage(String cleanText, List<String> noopLangsTmp, List<String> preferredLangsTmp, boolean limitOnPreferredLangs) {
|
||||
List<DetectedLanguage> detectedLanguageScores = getDetectedLanguageScores(cleanText, noopLangsTmp, preferredLangsTmp, limitOnPreferredLangs, 1);
|
||||
return detectedLanguageScores.stream().findFirst().orElse(null);
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public List<DetectedLanguage> getDetectedLanguageScores(String cleanText, List<String> noopLangsTmp, List<String> preferredLangsTmp, boolean limitOnPreferredLangs, int count) {
|
||||
String text = cleanText;
|
||||
ParsedLanguageLists parsedLanguageLists = prepareDetectLanguage(text, noopLangsTmp, preferredLangsTmp);
|
||||
if (parsedLanguageLists == null) {
|
||||
return new DetectedLanguage(null, new NoopLanguage());
|
||||
return Collections.singletonList(new DetectedLanguage(null, new NoopLanguage()));
|
||||
}
|
||||
List<String> additionalLangs = parsedLanguageLists.getAdditionalLangs();
|
||||
List<String> preferredLangs = parsedLanguageLists.getPreferredLangs();
|
||||
|
||||
Map.Entry<String, Double> result = null;
|
||||
Map<String, Double> scores = null;
|
||||
boolean fasttextFailed = false;
|
||||
String source = "";
|
||||
if (fastTextDetector != null || ngram != null) {
|
||||
try {
|
||||
Map<String, Double> scores;
|
||||
boolean usingFastText = false;
|
||||
if ((text.length() <= SHORT_ALGO_THRESHOLD || fastTextDetector == null) && ngram != null) {
|
||||
scores = ngram.detectLanguages(text.trim(), additionalLangs);
|
||||
|
|
@ -251,13 +258,13 @@ public class DefaultLanguageIdentifier extends LanguageIdentifier {
|
|||
scores = fastTextDetector.runFasttext(text, additionalLangs);
|
||||
source += "fasttext";
|
||||
}
|
||||
result = getHighestScoringResult(scores);
|
||||
/*if (result.getValue().floatValue() < THRESHOLD) {
|
||||
System.out.println("FastText below threshold: " + result.getValue().floatValue() + " for " + cleanText.length() + " chars");
|
||||
} else {
|
||||
System.out.println("FastText above threshold: " + result.getValue().floatValue() + " for " + cleanText.length() + " chars");
|
||||
}*/
|
||||
if ((usingFastText && result.getValue().floatValue() < FASTTEXT_CONFIDENCE_THRESHOLD) || result.getKey().equals("zz")) {
|
||||
Map.Entry<String, Double> fasttextHighestScoringResult = getHighestScoringResult(scores);
|
||||
if ((usingFastText && fasttextHighestScoringResult.getValue().floatValue() < FASTTEXT_CONFIDENCE_THRESHOLD) || fasttextHighestScoringResult.getKey().equals("zz")) {
|
||||
//System.out.println(cleanText + " ->" + result.getValue().floatValue() + " " + result.getKey());
|
||||
Map<Language, Integer> lang2Count = COMMON_WORDS_LANG_IDENTIFIER.getKnownWordsPerLanguage(text);
|
||||
Set<String> baseLangAlreadyHandled = new HashSet<>();
|
||||
|
|
@ -276,32 +283,19 @@ public class DefaultLanguageIdentifier extends LanguageIdentifier {
|
|||
}
|
||||
}
|
||||
source += "+commonwords";
|
||||
result = getHighestScoringResult(scores);
|
||||
}
|
||||
if (preferredLangs.contains("no") && !preferredLangs.contains("da")) {
|
||||
// Special case, as Norwegian easily gets detected as Danish (https://github.com/languagetool-org/languagetool/issues/5520).
|
||||
scores.keySet().removeIf(k -> k.equals("da"));
|
||||
result = getHighestScoringResult(scores);
|
||||
}
|
||||
if (!preferredLangs.isEmpty() && (text.length() <= CONSIDER_ONLY_PREFERRED_THRESHOLD || limitOnPreferredLangs)) {
|
||||
//System.out.println("remove? " + preferredLangs + " <-> " + scores);
|
||||
boolean wasRemoved = scores.keySet().removeIf(k -> !preferredLangs.contains(k));
|
||||
if (wasRemoved && scores.isEmpty() && limitOnPreferredLangs) {
|
||||
//TODO: just to see how often we would return no results because of that parameter -> remove later
|
||||
logger.warn("No language detected for text after remove all not preferred languages from score.");
|
||||
}
|
||||
//System.out.println("-> " + b + " ==> " + scores);
|
||||
result = getHighestScoringResult(scores);
|
||||
//add login was wäre wenn ansonsten hier so lassen
|
||||
source += "+prefLang(forced: " + limitOnPreferredLangs + ")";
|
||||
}
|
||||
// Calculate a trivial confidence value because fasttext's confidence is often
|
||||
// wrong for short cleanText (e.g. 0.99 for a test that's misclassified). Don't
|
||||
// use 1.0 because we can never be totally sure...
|
||||
double newScore = 0.99 / (30.0 / Math.min(text.length(), 30));
|
||||
//System.out.println("fasttext : " + result);
|
||||
//System.out.println("newScore : " + newScore);
|
||||
result = new AbstractMap.SimpleImmutableEntry<>(result.getKey(), newScore);
|
||||
} catch (FastTextDetector.FastTextException e) {
|
||||
if (e.isDisabled()) {
|
||||
fasttextFailed = true;
|
||||
|
|
@ -318,22 +312,47 @@ public class DefaultLanguageIdentifier extends LanguageIdentifier {
|
|||
if (fastTextDetector == null && ngram == null || fasttextFailed) { // no else, value can change in if clause
|
||||
text = textObjectFactory.forText(text).toString();
|
||||
source +="+fallback";
|
||||
result = detectLanguageCode(text, preferredLangs, limitOnPreferredLangs);
|
||||
if (additionalLangs.size() > 0) {
|
||||
logger.warn("Cannot consider noopLanguages because not in fastText mode: " + additionalLangs);
|
||||
if (scores == null) {
|
||||
scores = new HashMap<>();
|
||||
}
|
||||
Map.Entry<String, Double> localResult = detectLanguageCode(text, preferredLangs, limitOnPreferredLangs);
|
||||
if (localResult != null) {
|
||||
scores.put(localResult.getKey(), localResult.getValue());
|
||||
}
|
||||
if (!additionalLangs.isEmpty()) {
|
||||
logger.warn("Cannot consider noopLanguages because not in fastText mode: {}", additionalLangs);
|
||||
}
|
||||
}
|
||||
if (result != null && result.getKey() != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(result.getKey(), additionalLangs)) {
|
||||
return new DetectedLanguage(null,
|
||||
Languages.getLanguageForShortCode(result.getKey(), additionalLangs),
|
||||
result.getValue().floatValue(), source);
|
||||
|
||||
List<DetectedLanguage> detectedLanguages = new LinkedList<>();
|
||||
if (count > 1) {
|
||||
Map<String, Double> orderedScores = getOrderedScores(scores, count);
|
||||
for (Map.Entry<String, Double> entry : orderedScores.entrySet()) {
|
||||
if (entry.getKey() != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(entry.getKey(), additionalLangs)) {
|
||||
float rate = Math.round(entry.getValue() * 100.0) / 100.0f; // Convert to a non-scientific float and potentially round down
|
||||
detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(entry.getKey(), additionalLangs), rate, source));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (preferredLangs.size() > 0 && Languages.isLanguageSupported(preferredLangs.get(0))) {
|
||||
source += "+fallbackToPrefLang";
|
||||
return new DetectedLanguage(null, Languages.getLanguageForShortCode(preferredLangs.get(0)), 0.1f, source);
|
||||
Map.Entry<String, Double> highestScoringResult = getHighestScoringResult(scores);
|
||||
if (highestScoringResult.getKey() != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(highestScoringResult.getKey(), additionalLangs)) {
|
||||
float newScore;
|
||||
if (source.contains("fasttext")) {
|
||||
// Calculate a trivial confidence value because fasttext's confidence is often
|
||||
// wrong for short cleanText (e.g. 0.99 for a test that's misclassified). Don't
|
||||
// use 1.0 because we can never be totally sure...
|
||||
newScore = (float) (0.99/ (30.0 / Math.min(text.length(), 30)));
|
||||
} else {
|
||||
newScore = highestScoringResult.getValue().floatValue();
|
||||
}
|
||||
detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(highestScoringResult.getKey(), additionalLangs), newScore, source));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
if (detectedLanguages.isEmpty() && !preferredLangs.isEmpty() && Languages.isLanguageSupported(preferredLangs.get(0))) {
|
||||
source += "+fallbackToPrefLang";
|
||||
detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(preferredLangs.get(0)), 0.1f, source));
|
||||
}
|
||||
return detectedLanguages;
|
||||
}
|
||||
|
||||
private void reinitFasttextAfterFailure(Exception e) {
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ package org.languagetool.language.identifier;
|
|||
|
||||
import com.optimaize.langdetect.text.TextFilter;
|
||||
import lombok.Getter;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.languagetool.DetectedLanguage;
|
||||
import org.languagetool.Language;
|
||||
|
|
@ -79,6 +80,9 @@ public abstract class LanguageIdentifier {
|
|||
@Nullable
|
||||
public abstract DetectedLanguage detectLanguage(String cleanText, List<String> noopLangsTmp, List<String> preferredLangsTmp, boolean limitOnPreferredLangs);
|
||||
|
||||
@NotNull
|
||||
public abstract List<DetectedLanguage> getDetectedLanguageScores(String cleanText, List<String> noopLangsTmp, List<String> preferredLangsTmp, boolean limitOnPreferredLangs, int count);
|
||||
|
||||
/**
|
||||
* @param cleanText a cleanText as returned by {@link #cleanAndShortenText(String)}
|
||||
* @return language or {@code null} if language could not be identified
|
||||
|
|
@ -138,6 +142,16 @@ public abstract class LanguageIdentifier {
|
|||
return new AbstractMap.SimpleImmutableEntry<>(result, max);
|
||||
}
|
||||
|
||||
protected Map<String, Double> getOrderedScores(Map<String, Double> scores, int count) {
|
||||
ArrayList<Map.Entry<String, Double>> entries = new ArrayList<>(scores.entrySet());
|
||||
entries.sort(Map.Entry.comparingByValue(Collections.reverseOrder()));
|
||||
Map<String, Double> sortedScores = new LinkedHashMap<>();
|
||||
for (int i = 0; i < entries.size() && i < count; i++) {
|
||||
sortedScores.put(entries.get(i).getKey(), entries.get(i).getValue());
|
||||
}
|
||||
return sortedScores;
|
||||
}
|
||||
|
||||
protected static class ParsedLanguageLists {
|
||||
@Getter
|
||||
private final List<String> additionalLangs = new ArrayList<>();
|
||||
|
|
|
|||
|
|
@ -1,17 +1,17 @@
|
|||
/*
|
||||
* LanguageTool, a natural language style checker
|
||||
* LanguageTool, a natural language style checker
|
||||
* Copyright (c) 2022. Stefan Viol (https://stevio.de)
|
||||
*
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
||||
|
|
@ -21,6 +21,7 @@
|
|||
package org.languagetool.language.identifier;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.languagetool.DetectedLanguage;
|
||||
import org.languagetool.JLanguageTool;
|
||||
|
|
@ -167,6 +168,12 @@ public class SimpleLanguageIdentifier extends LanguageIdentifier {
|
|||
return this.detectLanguage(cleanText, noopLangsTmp, preferredLangsTmp);
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public List<DetectedLanguage> getDetectedLanguageScores(String cleanText, List<String> noopLangsTmp, List<String> preferredLangsTmp, boolean limitOnPreferredLangs, int count) {
|
||||
return Collections.singletonList(this.detectLanguage(cleanText, noopLangsTmp, preferredLangsTmp, limitOnPreferredLangs));
|
||||
}
|
||||
|
||||
@Nullable
|
||||
@Override
|
||||
public Language detectLanguage(String cleanText) {
|
||||
|
|
|
|||
|
|
@ -25,8 +25,6 @@ import org.jetbrains.annotations.NotNull;
|
|||
import org.jetbrains.annotations.Nullable;
|
||||
import org.languagetool.AnalyzedSentence;
|
||||
import org.languagetool.ApiCleanupNeeded;
|
||||
import org.languagetool.Language;
|
||||
import org.languagetool.rules.patterns.AbstractPatternRule;
|
||||
import org.languagetool.rules.patterns.PatternRule;
|
||||
import org.languagetool.rules.patterns.PatternRuleMatcher;
|
||||
import org.languagetool.tools.StringTools;
|
||||
|
|
@ -34,7 +32,6 @@ import org.languagetool.tools.StringTools;
|
|||
import java.net.URL;
|
||||
import java.util.*;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
|
|
@ -69,7 +66,7 @@ public class RuleMatch implements Comparable<RuleMatch> {
|
|||
private Type type = Type.Other;
|
||||
private SortedMap<String, Float> features = Collections.emptySortedMap();
|
||||
private boolean autoCorrect = false;
|
||||
private String errorLimitLang;
|
||||
private Map<String, Float> newLanguageMatches = new LinkedHashMap<>();
|
||||
|
||||
private String specificRuleId = "";
|
||||
|
||||
|
|
@ -602,22 +599,21 @@ public class RuleMatch implements Comparable<RuleMatch> {
|
|||
/**
|
||||
* The language that the text might be in if the error limit has been reached.
|
||||
*
|
||||
* @since 5.3
|
||||
* @since 6.4
|
||||
*/
|
||||
@Nullable
|
||||
public String getErrorLimitLang() {
|
||||
return errorLimitLang;
|
||||
public Map<String, Float> getNewLanguageMatches() {
|
||||
return newLanguageMatches;
|
||||
}
|
||||
|
||||
/**
|
||||
* Call if the error limit is reached for this sentence. The caller will then get text ranges for the
|
||||
* sentence and can ignore errors there. Note: will not have an effect for text-level rules.
|
||||
*
|
||||
* @param langCode the language this could be instead
|
||||
* @since 5.3
|
||||
* @param newLanguageMatches a map of possible languages this could be instead
|
||||
* @since 6.4
|
||||
*/
|
||||
public void setErrorLimitLang(String langCode) {
|
||||
this.errorLimitLang = langCode;
|
||||
public void setNewLanguageMatches(Map<String, Float> newLanguageMatches) {
|
||||
this.newLanguageMatches = newLanguageMatches;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@
|
|||
package org.languagetool.rules.spelling;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.languagetool.DetectedLanguage;
|
||||
import org.languagetool.Language;
|
||||
import org.languagetool.language.identifier.LanguageIdentifier;
|
||||
|
|
@ -28,20 +29,24 @@ import org.languagetool.language.identifier.LanguageIdentifierService;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@Slf4j
|
||||
public class ForeignLanguageChecker {
|
||||
|
||||
private static final float ERROR_THRESHOLD = 0.45f;
|
||||
private static final int MIN_SENTENCE_THRESHOLD = 3;
|
||||
private static final int MAX_SCORING_LANGUAGES = 5;
|
||||
public static final String NO_FOREIGN_LANG_DETECTED = "NO_FOREIGN_LANG_DETECTED";
|
||||
|
||||
|
||||
private final String languageShortCode;
|
||||
private final String sentence;
|
||||
private final long sentenceLength;
|
||||
private final List<String> preferredLanguages;
|
||||
|
||||
|
||||
public ForeignLanguageChecker(String languageShortCode, String sentence, Long sentenceLength, List<String> preferredLanguages) {
|
||||
this.languageShortCode = languageShortCode;
|
||||
this.sentence = sentence;
|
||||
|
|
@ -49,30 +54,38 @@ public class ForeignLanguageChecker {
|
|||
this.preferredLanguages = Collections.unmodifiableList(preferredLanguages);
|
||||
}
|
||||
|
||||
public String check(int matchesSoFar) throws IOException {
|
||||
@NotNull
|
||||
public Map<String, Float> check(int matchesSoFar) throws IOException {
|
||||
float errorRatio = (float) matchesSoFar / sentenceLength;
|
||||
if (sentenceLength >= MIN_SENTENCE_THRESHOLD && errorRatio >= ERROR_THRESHOLD) {
|
||||
LanguageIdentifier langIdent = LanguageIdentifierService.INSTANCE.getInitialized();
|
||||
if (langIdent != null) {
|
||||
DetectedLanguage langDetectResults = langIdent.detectLanguage(sentence, Collections.emptyList(), preferredLanguages);
|
||||
//for now, we just use the result if also in preferredLanguages to prevent false positive
|
||||
if (langDetectResults != null) {
|
||||
Language detectedLanguage = langDetectResults.getDetectedLanguage();
|
||||
if (detectedLanguage != null && !detectedLanguage.getShortCode().equals(languageShortCode) && preferredLanguages.contains(detectedLanguage.getShortCode())) {
|
||||
List<DetectedLanguage> detectedLanguageScores = langIdent.getDetectedLanguageScores(sentence, Collections.emptyList(), preferredLanguages, true, MAX_SCORING_LANGUAGES);
|
||||
Map<String, Float> results = new LinkedHashMap<>(MAX_SCORING_LANGUAGES);
|
||||
if (!detectedLanguageScores.isEmpty()) {
|
||||
for (int i = 0; i < detectedLanguageScores.size(); i++) {
|
||||
DetectedLanguage detectedLanguage = detectedLanguageScores.get(i);
|
||||
Language language = detectedLanguage.getDetectedLanguage();
|
||||
//The text main language still has the highest threshold
|
||||
if (i == 0 && language.getShortCode().equals(languageShortCode)) {
|
||||
return Collections.singletonMap(NO_FOREIGN_LANG_DETECTED, 0.99f);
|
||||
}
|
||||
//DO NEVER enable traceLevel for this class in production @LanguageTool
|
||||
log.trace("Found '{}' sentence in '{}' text: '{}' with confidence {} from source '{}'",
|
||||
detectedLanguage.getShortCode(),
|
||||
languageShortCode,
|
||||
sentence,
|
||||
langDetectResults.getDetectionConfidence(),
|
||||
langDetectResults.getDetectionSource());
|
||||
return detectedLanguage.getShortCode();
|
||||
} else if (detectedLanguage != null && detectedLanguage.getShortCode().equals(languageShortCode)) {
|
||||
return NO_FOREIGN_LANG_DETECTED;
|
||||
language.getShortCode(),
|
||||
languageShortCode,
|
||||
sentence,
|
||||
detectedLanguage.getDetectionConfidence(),
|
||||
detectedLanguage.getDetectionSource());
|
||||
results.put(language.getShortCode(), detectedLanguage.getDetectionConfidence());
|
||||
}
|
||||
return results;
|
||||
} else {
|
||||
return Collections.singletonMap(NO_FOREIGN_LANG_DETECTED, 0.99f);
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -259,10 +259,10 @@ public class HunspellRule extends SpellingCheckRule {
|
|||
}
|
||||
ruleMatches.add(ruleMatch);
|
||||
if (foreignLanguageChecker != null && !gotResultsFromForeignLanguageChecker) {
|
||||
String langCode = foreignLanguageChecker.check(ruleMatches.size());
|
||||
if (langCode != null) {
|
||||
if (!langCode.equals(ForeignLanguageChecker.NO_FOREIGN_LANG_DETECTED)) {
|
||||
ruleMatches.get(0).setErrorLimitLang(langCode);
|
||||
Map<String, Float> langCodesScoring = foreignLanguageChecker.check(ruleMatches.size());
|
||||
if (!langCodesScoring.isEmpty()) {
|
||||
if (langCodesScoring.get(ForeignLanguageChecker.NO_FOREIGN_LANG_DETECTED) == null) {
|
||||
ruleMatches.get(0).setNewLanguageMatches(langCodesScoring);
|
||||
}
|
||||
gotResultsFromForeignLanguageChecker = true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -205,10 +205,10 @@ public abstract class MorfologikSpellerRule extends SpellingCheckRule {
|
|||
isFirstWord = false;
|
||||
}
|
||||
if (foreignLanguageChecker != null && !gotResultsFromForeignLanguageChecker) {
|
||||
String langCode = foreignLanguageChecker.check(ruleMatches.size());
|
||||
if (langCode != null) {
|
||||
if (!langCode.equals(ForeignLanguageChecker.NO_FOREIGN_LANG_DETECTED)) {
|
||||
ruleMatches.get(0).setErrorLimitLang(langCode);
|
||||
Map<String, Float> langCodesScoring = foreignLanguageChecker.check(ruleMatches.size());
|
||||
if (!langCodesScoring.isEmpty()) {
|
||||
if (langCodesScoring.get(ForeignLanguageChecker.NO_FOREIGN_LANG_DETECTED) == null) {
|
||||
ruleMatches.get(0).setNewLanguageMatches(langCodesScoring);
|
||||
}
|
||||
gotResultsFromForeignLanguageChecker = true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import java.io.StringWriter;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Write rule matches and some meta information as JSON.
|
||||
|
|
@ -114,6 +115,7 @@ public class RuleMatchesAsJsonSerializer {
|
|||
}
|
||||
writeIgnoreRanges(g, res);
|
||||
writeSentenceRanges(g, res);
|
||||
writeExtendedSentenceRanges(g, res);
|
||||
g.writeEndObject();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
|
|
@ -232,6 +234,29 @@ public class RuleMatchesAsJsonSerializer {
|
|||
g.writeEndArray();
|
||||
}
|
||||
|
||||
private void writeExtendedSentenceRanges(JsonGenerator g, List<CheckResults> res) throws IOException{
|
||||
g.writeArrayFieldStart("extendedSentenceRanges");
|
||||
for (CheckResults r : res) {
|
||||
for (ExtendedSentenceRange range : r.getExtendedSentenceRanges()) {
|
||||
g.writeStartObject();
|
||||
g.writeNumberField("from", range.getFromPos());
|
||||
g.writeNumberField("to", range.getToPos());
|
||||
g.writeArrayFieldStart("detectedLanguages");
|
||||
for (Map.Entry<String, Float> entry : range.getLanguageConfidenceRates().entrySet()) {
|
||||
String language = entry.getKey();
|
||||
Float rate = entry.getValue();
|
||||
g.writeStartObject();
|
||||
g.writeStringField("language", language);
|
||||
g.writeNumberField("rate", rate);
|
||||
g.writeEndObject();
|
||||
}
|
||||
g.writeEndArray();
|
||||
g.writeEndObject();
|
||||
}
|
||||
}
|
||||
g.writeEndArray();
|
||||
}
|
||||
|
||||
private String cleanSuggestion(String s) {
|
||||
if (lang != null) {
|
||||
return lang.toAdvancedTypography(s); //.replaceAll("<suggestion>", lang.getOpeningDoubleQuote()).replaceAll("</suggestion>", lang.getClosingDoubleQuote())
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ import com.fasterxml.jackson.core.JsonFactory;
|
|||
import com.fasterxml.jackson.core.JsonGenerator;
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.languagetool.DetectedLanguage;
|
||||
import org.languagetool.Language;
|
||||
|
|
@ -36,6 +37,7 @@ import java.io.StringWriter;
|
|||
import java.io.Writer;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
|
|
@ -71,6 +73,12 @@ public class RemoteLangDetect extends LanguageIdentifier {
|
|||
return this.detectLanguage(cleanText, noopLangsTmp, preferredLangsTmp);
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public List<DetectedLanguage> getDetectedLanguageScores(String cleanText, List<String> noopLangsTmp, List<String> preferredLangsTmp, boolean limitOnPreferredLangs, int count) {
|
||||
return Collections.singletonList(this.detectLanguage(cleanText, noopLangsTmp, preferredLangsTmp));
|
||||
}
|
||||
|
||||
@Nullable
|
||||
@Override
|
||||
public Language detectLanguage(String text) {
|
||||
|
|
|
|||
|
|
@ -360,23 +360,6 @@ public class MorfologikAmericanSpellerRuleTest extends AbstractEnglishSpellerRul
|
|||
assertFalse(rule.isMisspelled("tables"));
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void testInteractiveMultilingualSignatureCase() throws IOException {
|
||||
String sig = "-- " +
|
||||
"Department of Electrical and Electronic Engineering\n" +
|
||||
"Office XY, Sackville Street Building, The University of Manchester, Manchester\n";
|
||||
List<AnalyzedSentence> analyzedSentences = lt.analyzeText("Hallo Herr Müller, wie geht\n\n" + sig);
|
||||
for (AnalyzedSentence analyzedSentence : analyzedSentences) {
|
||||
RuleMatch[] matches = rule.match(analyzedSentence);
|
||||
System.out.println("===================");
|
||||
System.out.println("S:" + analyzedSentence.getText());
|
||||
for (RuleMatch match : matches) {
|
||||
System.out.println(" getErrorLimitLang: " + match.getErrorLimitLang());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetOnlySuggestions() throws IOException {
|
||||
assertThat(rule.getOnlySuggestions("cemetary").size(), is(1));
|
||||
|
|
|
|||
|
|
@ -631,33 +631,6 @@ abstract class TextChecker {
|
|||
hiddenMatches.addAll(ResultExtender.getAsHiddenMatches(allMatches, premiumMatches));
|
||||
}
|
||||
|
||||
//### Start multiLangPart
|
||||
//TODO: implement recheck of ignoreRanges
|
||||
if (isMultiLangEnabled) {
|
||||
log.debug("Not implemented yet");
|
||||
// long startTimeRecheck = System.currentTimeMillis();
|
||||
// Map<String, List<Range>> rangesOrderedByLanguage = new HashMap<>();
|
||||
// res.forEach(checkResults -> {
|
||||
// checkResults.getIgnoredRanges().forEach(range -> {
|
||||
// List<Range> sentenceRanges = rangesOrderedByLanguage.getOrDefault(range.getLang(), new ArrayList<>());
|
||||
// sentenceRanges.add(range);
|
||||
// rangesOrderedByLanguage.put(range.getLang(), sentenceRanges);
|
||||
// });
|
||||
// });
|
||||
// rangesOrderedByLanguage.forEach((shortLangCode, ranges) -> {
|
||||
// Language rangeLanguage = Languages.getLanguageForShortCode(shortLangCode);
|
||||
// StringBuilder languageTextBuilder = new StringBuilder();
|
||||
// ranges.forEach(range -> {
|
||||
// String text = range.getAnalyzedSentence().getText().trim() + " ";
|
||||
// languageTextBuilder.append(text);
|
||||
// });
|
||||
// AnnotatedText finalTextToCheckAgain = new AnnotatedTextBuilder().addText(languageTextBuilder.toString().trim()).build();
|
||||
// });
|
||||
// long endTimeRecheck = System.currentTimeMillis();
|
||||
// log.trace("Time needed for recheck other languages: {}", (endTimeRecheck - startTimeRecheck) / 1000f);
|
||||
}
|
||||
//### End multiLangPart
|
||||
|
||||
int compactMode = Integer.parseInt(params.getOrDefault("c", "0"));
|
||||
String response = getResponse(aText, lang, detLang, motherTongue, res, hiddenMatches, incompleteResultReason, compactMode,
|
||||
limits.getPremiumUid() == null, qParams.mode);
|
||||
|
|
@ -846,36 +819,6 @@ abstract class TextChecker {
|
|||
} else {
|
||||
List<CheckResults> res = new ArrayList<>();
|
||||
res.addAll(getPipelineResults(aText, lang, motherTongue, params, userConfig, listener));
|
||||
// NOTE: Not needed anymore. The "multilingual" parameter is not used.
|
||||
// if (preferredLangs.size() < 2 || parameters.get("multilingual") == null || parameters.get("multilingual").equals("false")) {
|
||||
// res.addAll(getPipelineResults(aText, lang, motherTongue, params, userConfig, listener));
|
||||
// }
|
||||
// else {
|
||||
// // support for multilingual texts:
|
||||
// try {
|
||||
// Language mainLang = getLanguageVariantForCode(detLang.getDetectedLanguage().getShortCode(), preferredVariants);
|
||||
// List<Language> secondLangs = new ArrayList<>();
|
||||
// for (String preferredLangCode : preferredLangs) {
|
||||
// if (!preferredLangCode.equals(mainLang.getShortCode())) {
|
||||
// secondLangs.add(getLanguageVariantForCode(preferredLangCode, preferredVariants));
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// LanguageAnnotator annotator = new LanguageAnnotator();
|
||||
// List<FragmentWithLanguage> fragments = annotator.detectLanguages(aText.getPlainText(), mainLang, secondLangs);
|
||||
// List<Language> langs = new ArrayList<>();
|
||||
// langs.add(mainLang);
|
||||
// langs.addAll(secondLangs);
|
||||
// Map<Language, AnnotatedTextBuilder> lang2builder = getBuilderMap(fragments, new HashSet<>(langs));
|
||||
// for (Map.Entry<Language, AnnotatedTextBuilder> entry : lang2builder.entrySet()) {
|
||||
// res.addAll(getPipelineResults(entry.getValue().build(), entry.getKey(), motherTongue, params, userConfig, listener));
|
||||
// }
|
||||
// } catch (Exception e) {
|
||||
// log.error("Problem with multilingual mode (preferredLangs=" + preferredLangs+ ", preferredVariants=" + preferredVariants + "), " +
|
||||
// "falling back to single language.", e);
|
||||
// res.addAll(getPipelineResults(aText, lang, motherTongue, params, userConfig, listener));
|
||||
// }
|
||||
// }
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,86 +23,228 @@ package org.languagetool.language.multiLanguage;
|
|||
import org.junit.BeforeClass;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.languagetool.JLanguageTool;
|
||||
import org.languagetool.Languages;
|
||||
import org.languagetool.TestTools;
|
||||
import org.languagetool.UserConfig;
|
||||
import org.languagetool.*;
|
||||
import org.languagetool.language.AmericanEnglish;
|
||||
import org.languagetool.language.GermanyGerman;
|
||||
import org.languagetool.language.identifier.LanguageIdentifier;
|
||||
import org.languagetool.language.identifier.LanguageIdentifierService;
|
||||
import org.languagetool.rules.RuleMatch;
|
||||
import org.languagetool.rules.de.GermanSpellerRule;
|
||||
import org.languagetool.markup.AnnotatedText;
|
||||
import org.languagetool.markup.AnnotatedTextBuilder;
|
||||
import org.languagetool.rules.Rule;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class MultiLanguageTest {
|
||||
|
||||
private final static String fastTextBinary = "/home/stefan/Dokumente/languagetool/data/fasttext/fasttext";
|
||||
private final static String fastTextModel = "/home/stefan/Dokumente/languagetool/data/fasttext/lid.176.bin";
|
||||
private final static String ngramData = "/home/stefan/Dokumente/languagetool/data/model_ml50_new.zip";
|
||||
|
||||
private static final String fastTextBinary = "/home/stefan/Dokumente/languagetool/data/fasttext/fasttext";
|
||||
private static final String fastTextModel = "/home/stefan/Dokumente/languagetool/data/fasttext/lid.176.bin";
|
||||
private static final String ngramData = "/home/stefan/Dokumente/languagetool/data/model_ml50_new.zip";
|
||||
private static final GermanyGerman GERMAN_DE = (GermanyGerman) Languages.getLanguageForShortCode("de-DE");
|
||||
private static final AmericanEnglish ENGLISH_US = (AmericanEnglish) Languages.getLanguageForShortCode("en-US");
|
||||
|
||||
private static List<String> ENGLISH_SENTENCES = Arrays.asList(
|
||||
"He is a very cool guy from Poland.",
|
||||
"How are you?",
|
||||
"But this is English.",
|
||||
"This is so cool.",
|
||||
"How are you my friend?",
|
||||
"Not sure if it's really",
|
||||
//"Nokia Takes Its Peers To Task.", TODO: could not detect this sentence for now
|
||||
"And I’m an English text!");
|
||||
|
||||
private static List<String> GERMAN_SENTENCES = Arrays.asList(
|
||||
"Und er sagte, this is a good test."
|
||||
);
|
||||
private static final UserConfig userConfig = new UserConfig(Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), 0, 0L, null, 0L, null, false, null, null, false, Arrays.asList("en", "de", "fr", "es", "pt", "nl"));
|
||||
private static JLanguageTool germanJLanguageTool;
|
||||
private static JLanguageTool germanJLanguageToolWPL;
|
||||
private static JLanguageTool englishJLanguageTool;
|
||||
private static JLanguageTool englishJLanguageToolWPL;
|
||||
|
||||
@BeforeClass
|
||||
public static void setup() {
|
||||
LanguageIdentifierService.INSTANCE.getDefaultLanguageIdentifier(0, new File(ngramData), new File(fastTextBinary), new File(fastTextModel));
|
||||
|
||||
germanJLanguageTool = new JLanguageTool(GERMAN_DE, null, userConfig);
|
||||
germanJLanguageTool.disableRules(germanJLanguageTool.getAllRules().stream().map(Rule::getId).collect(Collectors.toList()));
|
||||
germanJLanguageTool.enableRule("GERMAN_SPELLER_RULE");
|
||||
germanJLanguageToolWPL = new JLanguageTool(GERMAN_DE);
|
||||
germanJLanguageToolWPL.disableRules(germanJLanguageToolWPL.getAllRules().stream().map(Rule::getId).collect(Collectors.toList()));
|
||||
germanJLanguageToolWPL.enableRule("GERMAN_SPELLER_RULE");
|
||||
|
||||
englishJLanguageTool = new JLanguageTool(ENGLISH_US, null, userConfig);
|
||||
englishJLanguageTool.disableRules(englishJLanguageTool.getAllRules().stream().map(Rule::getId).collect(Collectors.toList()));
|
||||
englishJLanguageTool.enableRule("MORFOLOGIK_RULE_EN_US");
|
||||
englishJLanguageToolWPL = new JLanguageTool(ENGLISH_US);
|
||||
englishJLanguageToolWPL.disableRules(englishJLanguageToolWPL.getAllRules().stream().map(Rule::getId).collect(Collectors.toList()));
|
||||
englishJLanguageToolWPL.enableRule("MORFOLOGIK_RULE_EN_US");
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore("Only run with full LanguageIdentifierService")
|
||||
public void testWithPreferredLanguagesDeAndEn() throws IOException {
|
||||
List<String> preferredLanguages = Arrays.asList("en","de");
|
||||
UserConfig userConfig = new UserConfig(Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), 0, 0L, null, 0L, null, false, null, null, false, preferredLanguages);
|
||||
GermanSpellerRule germanSpellerRule = new GermanSpellerRule(TestTools.getMessages("de"), GERMAN_DE, userConfig, null);
|
||||
JLanguageTool lt = new JLanguageTool(GERMAN_DE);
|
||||
|
||||
//test short sentences
|
||||
int matchCounter = 0;
|
||||
for (String sentence : ENGLISH_SENTENCES) {
|
||||
RuleMatch[] matches = germanSpellerRule.match(lt.getAnalyzedSentence(sentence));
|
||||
for (RuleMatch match : matches) {
|
||||
//only matches in one of the preferred languages are accepted
|
||||
if (match.getErrorLimitLang() != null && match.getErrorLimitLang().equals("en")) {
|
||||
matchCounter++;
|
||||
break;
|
||||
}
|
||||
@Ignore("Only run with full LanguageIdentifierService (fasttext and ngrams")
|
||||
public void multiLangTest() throws IOException {
|
||||
multiLangHunspellRuleTest();
|
||||
multiLangMorfologikRuleTest();
|
||||
}
|
||||
|
||||
private void multiLangHunspellRuleTest() throws IOException {
|
||||
CheckResults checkResults = germanJLanguageTool.check2(getAnnotatedText(), true, JLanguageTool.ParagraphHandling.NORMAL, null, JLanguageTool.Mode.ALL_BUT_TEXTLEVEL_ONLY, JLanguageTool.Level.DEFAULT, Collections.emptySet(), null);
|
||||
//run 2nd LT for benchmark (first check is always very slow)
|
||||
germanJLanguageToolWPL.check2(getAnnotatedText(), true, JLanguageTool.ParagraphHandling.NORMAL, null, JLanguageTool.Mode.ALL_BUT_TEXTLEVEL_ONLY, JLanguageTool.Level.DEFAULT, Collections.emptySet(), null);
|
||||
testExtendedSentenceRanges(checkResults.getExtendedSentenceRanges());
|
||||
benchmarkMultiLang(germanJLanguageTool, germanJLanguageToolWPL, "de");
|
||||
}
|
||||
|
||||
private void multiLangMorfologikRuleTest() throws IOException {
|
||||
CheckResults checkResults = englishJLanguageTool.check2(getAnnotatedText(), true, JLanguageTool.ParagraphHandling.NORMAL, null, JLanguageTool.Mode.ALL_BUT_TEXTLEVEL_ONLY, JLanguageTool.Level.DEFAULT, Collections.emptySet(), null);
|
||||
//run 2nd LT for benchmark (first check is always very slow)
|
||||
englishJLanguageToolWPL.check2(getAnnotatedText(), true, JLanguageTool.ParagraphHandling.NORMAL, null, JLanguageTool.Mode.ALL_BUT_TEXTLEVEL_ONLY, JLanguageTool.Level.DEFAULT, Collections.emptySet(), null);
|
||||
testExtendedSentenceRanges(checkResults.getExtendedSentenceRanges());
|
||||
benchmarkMultiLang(englishJLanguageTool, englishJLanguageToolWPL, "en");
|
||||
}
|
||||
|
||||
private void testExtendedSentenceRanges(List<ExtendedSentenceRange> extendedSentenceRanges) {
|
||||
assertNotNull(extendedSentenceRanges);
|
||||
assertFalse(extendedSentenceRanges.isEmpty());
|
||||
assertEquals(36, extendedSentenceRanges.size());
|
||||
|
||||
testRangeAndLanguage(0, 27, "de", extendedSentenceRanges.get(0));
|
||||
testRangeAndLanguage(29, 87, "de", extendedSentenceRanges.get(1));
|
||||
testRangeAndLanguage(88, 144, "en", extendedSentenceRanges.get(2));
|
||||
testRangeAndLanguage(145, 171, "de", extendedSentenceRanges.get(3));
|
||||
testRangeAndLanguage(173, 214, "fr", extendedSentenceRanges.get(4));
|
||||
// testRangeAndLanguage(216, 349, "en", extendedSentenceRanges.get(5)); // not detected as non-German sentence with GERMAN_SPELLER_RULE
|
||||
testRangeAndLanguage(351, 476, "fr", extendedSentenceRanges.get(6));
|
||||
testRangeAndLanguage(476, 701, "es", extendedSentenceRanges.get(7));
|
||||
testRangeAndLanguage(701, 865, "nl", extendedSentenceRanges.get(8));
|
||||
testRangeAndLanguage(865, 882, "de", extendedSentenceRanges.get(9));
|
||||
testRangeAndLanguage(882, 902, "en", extendedSentenceRanges.get(10));
|
||||
testRangeAndLanguage(902, 932, "en", extendedSentenceRanges.get(11));
|
||||
testRangeAndLanguage(934, 1062, "pt", extendedSentenceRanges.get(12));
|
||||
// testRangeAndLanguage(1062, 1118, "fr", extendedSentenceRanges.get(13)); // not detected as non-English sentence with MORFOLOGIK_RULE_EN_US
|
||||
testRangeAndLanguage(1119, 1183, "de", extendedSentenceRanges.get(14));
|
||||
testRangeAndLanguage(1184, 1240, "es", extendedSentenceRanges.get(15));
|
||||
testRangeAndLanguage(1241, 1301, "pt", extendedSentenceRanges.get(16));
|
||||
testRangeAndLanguage(1302, 1349, "en", extendedSentenceRanges.get(17));
|
||||
testRangeAndLanguage(1350, 1419, "nl", extendedSentenceRanges.get(18));
|
||||
testRangeAndLanguage(1420, 1481, "nl", extendedSentenceRanges.get(19));
|
||||
testRangeAndLanguage(1481, 1544, "de", extendedSentenceRanges.get(20));
|
||||
testRangeAndLanguage(1545, 1618, "fr", extendedSentenceRanges.get(21));
|
||||
testRangeAndLanguage(1619, 1701, "es", extendedSentenceRanges.get(22));
|
||||
testRangeAndLanguage(1702, 1771, "pt", extendedSentenceRanges.get(23));
|
||||
testRangeAndLanguage(1772, 1843, "en", extendedSentenceRanges.get(24));
|
||||
testRangeAndLanguage(1844, 1936, "nl", extendedSentenceRanges.get(25));
|
||||
testRangeAndLanguage(1936, 2017, "de", extendedSentenceRanges.get(26));
|
||||
testRangeAndLanguage(2018, 2098, "fr", extendedSentenceRanges.get(27));
|
||||
testRangeAndLanguage(2099, 2174, "es", extendedSentenceRanges.get(28));
|
||||
testRangeAndLanguage(2175, 2251, "pt", extendedSentenceRanges.get(29));
|
||||
testRangeAndLanguage(2252, 2318, "en", extendedSentenceRanges.get(30));
|
||||
testRangeAndLanguage(2319, 2385, "fr", extendedSentenceRanges.get(31));
|
||||
// testRangeAndLanguage(2386, 2452, "es", extendedSentenceRanges.get(32)); // not detected as non-English sentence with MORFOLOGIK_RULE_EN_US
|
||||
testRangeAndLanguage(2453, 2524, "pt", extendedSentenceRanges.get(33));
|
||||
testRangeAndLanguage(2525, 2593, "es", extendedSentenceRanges.get(34));
|
||||
testRangeAndLanguage(2594, 2671, "de", extendedSentenceRanges.get(35));
|
||||
}
|
||||
|
||||
private void testRangeAndLanguage(int expectedStart, int expectedEnd, String lang, ExtendedSentenceRange sentence) {
|
||||
assertEquals(expectedStart, sentence.getFromPos());
|
||||
assertEquals(expectedEnd, sentence.getToPos());
|
||||
assertEquals(lang, getLanguageWithHighestConfidenceRate(sentence.getLanguageConfidenceRates()));
|
||||
}
|
||||
|
||||
private void benchmarkMultiLang(JLanguageTool withMultiLang, JLanguageTool withoutMultiLang, String mainLang) throws IOException {
|
||||
String text = "Die romantische Stadt Paris ist für ihre Eiffelturm und köstliche Küche berühmt.\n" +
|
||||
"La romántica ciudad de París es famosa por su Torre Eiffel y su deliciosa cocina.\n" +
|
||||
"The romantic city of Paris is famous for its Eiffel Tower and delicious cuisine.\n" +
|
||||
"L'Espagne attire avec de magnifiques plages sur la côte méditerranéenne.\n" +
|
||||
"A Espanha atrai com belas praias na costa mediterrânea.\n" +
|
||||
"Spain entices with beautiful beaches on the Mediterranean coast.\n" +
|
||||
"Die Niederlande sind für ihre Tulpenfelder und charmanten Windmühlen bekannt.\n" +
|
||||
"Les Pays-Bas sont connus pour leurs champs de tulipes et leurs charmants moulins à vent.\n" +
|
||||
"The Netherlands is known for its tulip fields and charming windmills.\n" +
|
||||
"Nederland staat bekend om zijn tulpenvelden en charmante windmolens." +
|
||||
"Berlin ist die Hauptstadt Deutschlands und hat eine faszinierende Geschichte.\n" +
|
||||
"La gastronomie française est réputée pour ses délicieuses pâtisseries.\n" +
|
||||
"Madrid es conocida por su animada vida nocturna y deliciosa comida tapas.\n" +
|
||||
"O carnaval do Brasil é uma das festas mais animadas do mundo.\n" +
|
||||
"The British countryside is famous for its rolling hills and quaint villages.\n" +
|
||||
"Brugge is beroemd om zijn goed bewaarde middeleeuwse architectuur en kanalen." +
|
||||
"München ist berühmt für sein Oktoberfest, das größte Bierfest der Welt.\n" +
|
||||
"La Tour Eiffel scintille de mille lumières lors de la nuit à Paris.\n" +
|
||||
"En Barcelona, la playa y la arquitectura gótica son impresionantes.\n" +
|
||||
"O futebol é paixão nacional no Brasil, com Pelé sendo uma lenda do esporte.\n" +
|
||||
"London's museums, such as the British Museum and the National Gallery, are world-renowned.\n" +
|
||||
"Les canaux d'Amsterdam sont bordés de maisons étroites à pignons.\n" +
|
||||
"El tango argentino es conocido por su pasión y elegancia en el baile.\n" +
|
||||
"Die Alpen erstrecken sich über mehrere europäische Länder und bieten großartige Skimöglichkeiten.\n" +
|
||||
"La paella es un plato tradicional español que combina arroz, mariscos y azafrán.\n" +
|
||||
"Os moinhos de vento na Holanda são um marco icônico da paisagem rural.";
|
||||
AnnotatedText aText = new AnnotatedTextBuilder().addText(text).build();
|
||||
|
||||
long startCheckWithAdditionalDetection = System.currentTimeMillis();
|
||||
CheckResults withMulti = withMultiLang.check2(aText, true, JLanguageTool.ParagraphHandling.NORMAL, null, JLanguageTool.Mode.ALL_BUT_TEXTLEVEL_ONLY, JLanguageTool.Level.DEFAULT, Collections.emptySet(), null);
|
||||
long endCheckWithAdditionalDetection = System.currentTimeMillis();
|
||||
System.out.println("(" + mainLang + ") " + "Check time with multi language: " + (endCheckWithAdditionalDetection - startCheckWithAdditionalDetection) + "ms");
|
||||
|
||||
long startCheckWithoutAdditionalDetection = System.currentTimeMillis();
|
||||
CheckResults withoutMulti = withoutMultiLang.check2(aText, true, JLanguageTool.ParagraphHandling.NORMAL, null, JLanguageTool.Mode.ALL_BUT_TEXTLEVEL_ONLY, JLanguageTool.Level.DEFAULT, Collections.emptySet(), null);
|
||||
long endCheckWithoutAdditionalDetection = System.currentTimeMillis();
|
||||
System.out.println("(" + mainLang + ") " + "Check time without multi language: " + (endCheckWithoutAdditionalDetection - startCheckWithoutAdditionalDetection) + "ms");
|
||||
|
||||
assertEquals(1, withMultiLang.getAllActiveRules().size());
|
||||
assertEquals(1, withoutMultiLang.getAllActiveRules().size());
|
||||
|
||||
Set<String> detectedLanguages = withMulti.getExtendedSentenceRanges().stream().map(extendedSentenceRange -> getLanguageWithHighestConfidenceRate(extendedSentenceRange.getLanguageConfidenceRates())).collect(Collectors.toSet());
|
||||
assertEquals(6, detectedLanguages.size());
|
||||
|
||||
withoutMulti.getExtendedSentenceRanges().forEach(extendedSentenceRange -> {
|
||||
assertEquals(mainLang, getLanguageWithHighestConfidenceRate(extendedSentenceRange.getLanguageConfidenceRates()));
|
||||
});
|
||||
}
|
||||
|
||||
private AnnotatedText getAnnotatedText() {
|
||||
String text = "Hallo Herr Müller, wie geht\n\n" + // 0 - 27
|
||||
"Das ist jetzt deutsch und wird auch die Hauptsprache sein.\n" + // 29 - 87
|
||||
"This Text is in english but the other one was in german.\n" + // 88 - 144
|
||||
"Hier wieder etwas deutsch.\n" + // 145 - 171
|
||||
"\n" +
|
||||
"La page que vous cherchez est introuvable\n" + // 173 - 214
|
||||
"\n" +
|
||||
"-- " +
|
||||
"Department of Electrical and Electronic Engineering\n" +
|
||||
"Office XY, Sackville Street Building, The University of Manchester, Manchester\n\n" + // 216 - 349
|
||||
"Wikipédia est un projet d’encyclopédie collective en ligne, universelle, multilingue et fonctionnant sur le principe du wiki." + // 351 -476
|
||||
"Anderson creció junto a su familia primero en el sur de Estados Unidos, luego se establecieron por un tiempo en Missuri y finalmente en Kansas hasta que se emancipó en 1862, manteniéndose mediante el robo y venta de caballos." + // 476 - 701
|
||||
"Wikipedia is een online encyclopedie die ernaar streeft informatie te bieden in alle erkende talen ter wereld, die vrij herbruikbaar, objectief en verifieerbaar is." + // 701 - 865
|
||||
"Ein schöner Satz." + // 865 - 882
|
||||
"But this is English." + // 882 - 902
|
||||
"Tom, could we meet next Monday\n\n" + // 902 - 932
|
||||
"Este novo website tem como objetivo fornecer toda a informação aos seus utilizadores, da forma mais clara e atualizada possível." + // 934 - 1062
|
||||
"La tour Eiffel est un symbole emblématique de la France.\n" + // 1062 - 1118
|
||||
"Die schöne Küste Spaniens ist ein beliebtes Reiseziel im Sommer.\n" + // 1119 - 1183
|
||||
"Las playas de España son famosas por su belleza natural.\n" + // 1184 - 1240
|
||||
"As praias de Portugal são conhecidas pela sua areia dourada.\n" + // 1241 - 1301
|
||||
"The Eiffel Tower is an iconic symbol of France.\n" + // 1302 - 1349
|
||||
"De prachtige kust van Spanje is een populaire bestemming in de zomer.\n" + // 1350 - 1419
|
||||
"De stranden van Nederland zijn ideaal voor lange wandelingen." + // 1420 - 1481
|
||||
"Die Seine schlängelt sich durch die belebten Straßen von Paris.\n" + // 1481 - 1544
|
||||
"La cuisine française est renommée pour sa délicieuse variété de fromages.\n" + // 1545 - 1618
|
||||
"Barcelona es famosa por su arquitectura modernista, incluyendo la Sagrada Familia.\n" + // 1619 - 1701
|
||||
"A cidade do Rio de Janeiro é conhecida por suas praias deslumbrantes.\n" + // 1702 - 1771
|
||||
"London is famous for its iconic red double-decker buses and black cabs.\n" + // 1772 - 1843
|
||||
"Amsterdam staat bekend om zijn schilderachtige grachten en fietsvriendelijke infrastructuur." + // 1844 - 1936
|
||||
"Die deutsche Autobahn ist weltweit berühmt für ihre Geschwindigkeitsbegrenzungen.\n" + // 1936 - 2017
|
||||
"La cuisine française est réputée pour sa délicatesse et sa diversité de saveurs.\n" + // 2018 - 2098
|
||||
"El tango argentino es una danza apasionada que refleja la cultura del país.\n" + // 2099 - 2174
|
||||
"A culinária portuguesa é conhecida por pratos como o bacalhau à Gomes de Sá.\n" + // 2175 - 2251
|
||||
"The Big Ben clock tower in London is an iconic symbol of the city.\n" + // 2252 - 2318
|
||||
"Les canaux d'Amsterdam offrent des balades pittoresques en bateau.\n" + // 2319 - 2385
|
||||
"El flamenco es un género musical y de baile tradicional en España.\n" + // 2386 - 2452
|
||||
"Os campos de tulipas na Holanda criam paisagens coloridas na primavera.\n" + // 2453 - 2524
|
||||
"La paella española es famosa por su mezcla de sabores y su colorido.\n" + // 2525 - 2593
|
||||
"Der Schwarzwald in Deutschland ist berühmt für seine dichten Wälder und Seen."; // 2594 - 2671
|
||||
return new AnnotatedTextBuilder().addText(text).build();
|
||||
}
|
||||
|
||||
private String getLanguageWithHighestConfidenceRate(Map<String, Float> languages) {
|
||||
final float[] top = {-1f};
|
||||
final String[] lang = {""};
|
||||
languages.forEach((l, r) -> {
|
||||
if (r > top[0]) {
|
||||
top[0] = r;
|
||||
lang[0] = l;
|
||||
}
|
||||
}
|
||||
assertEquals("Not all foreign sentences detected", ENGLISH_SENTENCES.size(), matchCounter);
|
||||
matchCounter = 0;
|
||||
for (String sentence : GERMAN_SENTENCES) {
|
||||
RuleMatch[] matches = germanSpellerRule.match(lt.getAnalyzedSentence(sentence));
|
||||
for (RuleMatch match : matches) {
|
||||
//only matches in one of the preferred languages are accepted
|
||||
if (match.getErrorLimitLang() != null && match.getErrorLimitLang().equals("de")) {
|
||||
matchCounter++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
assertEquals("False positive detected languages", 0, matchCounter);
|
||||
});
|
||||
return lang[0];
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,134 +0,0 @@
|
|||
/*
|
||||
* LanguageTool, a natural language style checker
|
||||
* Copyright (c) 2022. Stefan Viol (https://stevio.de)
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
||||
* USA
|
||||
*/
|
||||
|
||||
package org.languagetool.rules;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.languagetool.AnalyzedSentence;
|
||||
import org.languagetool.JLanguageTool;
|
||||
import org.languagetool.Languages;
|
||||
import org.languagetool.TestTools;
|
||||
import org.languagetool.language.AmericanEnglish;
|
||||
import org.languagetool.language.GermanyGerman;
|
||||
import org.languagetool.language.identifier.LanguageIdentifierService;
|
||||
import org.languagetool.rules.de.GermanSpellerRule;
|
||||
import org.languagetool.rules.en.MorfologikAmericanSpellerRule;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.is;
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class MultiLanguageTextTest {
|
||||
|
||||
private static final GermanyGerman GERMAN_DE = (GermanyGerman) Languages.getLanguageForShortCode("de-DE");
|
||||
private static final AmericanEnglish ENGLISH_US = (AmericanEnglish) Languages.getLanguageForShortCode("en-US");
|
||||
|
||||
private static MorfologikAmericanSpellerRule morfologikAmericanSpellerRule;
|
||||
private static GermanSpellerRule germanSpellerRule;
|
||||
@BeforeClass
|
||||
public static void setup() throws IOException {
|
||||
germanSpellerRule = new GermanSpellerRule(TestTools.getMessages("de"), GERMAN_DE);
|
||||
morfologikAmericanSpellerRule = new MorfologikAmericanSpellerRule(TestTools.getMessages("en"), ENGLISH_US);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore //TODO: need rework: works only with preferred languages in userConfig
|
||||
public void testEnglishInGermanDetected() throws IOException {
|
||||
JLanguageTool lt = new JLanguageTool(GERMAN_DE);
|
||||
RuleMatch[] matches1 = germanSpellerRule.match(lt.getAnalyzedSentence("He is a very cool guy from Poland."));
|
||||
boolean match1Found = false;
|
||||
for (RuleMatch match : matches1) {
|
||||
if (match.getErrorLimitLang() != null && match.getErrorLimitLang().equals("en")) {
|
||||
match1Found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertTrue("It was expected to find a match.", match1Found);
|
||||
|
||||
RuleMatch[] matches2 = germanSpellerRule.match(lt.getAnalyzedSentence("How are you?"));
|
||||
boolean match2Found = false;
|
||||
for (RuleMatch match : matches2) {
|
||||
if (match.getErrorLimitLang() != null && match.getErrorLimitLang().equals("en")) {
|
||||
match2Found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertTrue("It was expected to find a match.", match2Found);
|
||||
|
||||
RuleMatch[] matches3 = germanSpellerRule.match(lt.getAnalyzedSentence("CONFIDENTIALITY NOTICE:"));
|
||||
boolean match3Found = false;
|
||||
for (RuleMatch match : matches3) {
|
||||
if (match.getErrorLimitLang() != null && match.getErrorLimitLang().equals("en")) {
|
||||
match3Found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertTrue("It was expected to find a match.", match3Found);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore //TODO: need rework: works only with preferred languages in userConfig
|
||||
public void testWithLanguageIdentifier() throws IOException {
|
||||
LanguageIdentifierService.INSTANCE.getDefaultLanguageIdentifier(1000, new File("/home/stefan/Dokumente/languagetool/data/model_ml50_new.zip"), new File("/home/stefan/Dokumente/languagetool/data/fasttext/fasttext"), new File("/home/stefan/Dokumente/languagetool/data/fasttext/lid.176.bin"));
|
||||
JLanguageTool lt = new JLanguageTool(GERMAN_DE);
|
||||
|
||||
RuleMatch[] matchesFr = germanSpellerRule.match(lt.getAnalyzedSentence("Wikipédia est un projet d’encyclopédie collective en ligne, universelle, multilingue et fonctionnant sur le principe du wiki."));
|
||||
RuleMatch lastMatchFr = matchesFr[matchesFr.length - 1];
|
||||
assertEquals(lastMatchFr.getErrorLimitLang(), "fr");
|
||||
|
||||
RuleMatch[] matchesEs = germanSpellerRule.match(lt.getAnalyzedSentence("Anderson creció junto a su familia primero en el sur de Estados Unidos, luego se establecieron por un tiempo en Missuri y finalmente en Kansas hasta que se emancipó en 1862, manteniéndose mediante el robo y venta de caballos."));
|
||||
RuleMatch lastMatchEs = matchesEs[matchesEs.length - 1];
|
||||
assertEquals(lastMatchEs.getErrorLimitLang(), "es");
|
||||
|
||||
RuleMatch[] matchesNl = germanSpellerRule.match(lt.getAnalyzedSentence("Wikipedia is een online encyclopedie die ernaar streeft informatie te bieden in alle erkende talen ter wereld, die vrij herbruikbaar, objectief en verifieerbaar is."));
|
||||
RuleMatch lastMatchNl = matchesNl[matchesNl.length - 1];
|
||||
assertEquals(lastMatchNl.getErrorLimitLang(), "nl");
|
||||
|
||||
LanguageIdentifierService.INSTANCE.clearLanguageIdentifier("both"); //clear for next test
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore("Moved from MorfologikAmericanSpellerRuleTest and run with langIdentifier")
|
||||
// case: signature is (mostly) English, user starts typing in German -> first, EN is detected for whole text
|
||||
public void testMultilingualSignatureCase() throws IOException {
|
||||
LanguageIdentifierService.INSTANCE.getDefaultLanguageIdentifier(1000, new File("/home/stefan/Dokumente/languagetool/data/model_ml50_new.zip"), new File("/home/stefan/Dokumente/languagetool/data/fasttext/fasttext"), new File("/home/stefan/Dokumente/languagetool/data/fasttext/lid.176.bin"));
|
||||
JLanguageTool lt = new JLanguageTool(ENGLISH_US);
|
||||
String sig = "-- " +
|
||||
"Department of Electrical and Electronic Engineering\n" +
|
||||
"Office XY, Sackville Street Building, The University of Manchester, Manchester\n";
|
||||
assertZZ("Hallo Herr Müller, wie geht\n\n" + sig, lt); // "Herr" and "Müller" are accepted by EN speller
|
||||
assertZZ("Hallo Frau Müller, wie\n\n" + sig, lt); // "Frau" and "Müller" are accepted by EN speller
|
||||
assertZZ("Hallo Frau Sauer, wie\n\n" + sig, lt);
|
||||
LanguageIdentifierService.INSTANCE.clearLanguageIdentifier("both"); //clear for next test
|
||||
}
|
||||
|
||||
private void assertZZ(String input, JLanguageTool lt) throws IOException {
|
||||
List<AnalyzedSentence> analyzedSentences = lt.analyzeText(input);
|
||||
assertThat(analyzedSentences.size(), is(2));
|
||||
assertThat(morfologikAmericanSpellerRule.match(analyzedSentences.get(0))[0].getErrorLimitLang(), is("de"));
|
||||
assertNull(morfologikAmericanSpellerRule.match(analyzedSentences.get(1))[0].getErrorLimitLang());
|
||||
}
|
||||
}
|
||||
|
|
@ -1,105 +0,0 @@
|
|||
/* LanguageTool, a natural language style checker
|
||||
* Copyright (C) 2012 Daniel Naber (http://www.danielnaber.de)
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
||||
* USA
|
||||
*/
|
||||
package org.languagetool.rules.de;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.languagetool.AnalyzedSentence;
|
||||
import org.languagetool.JLanguageTool;
|
||||
import org.languagetool.Languages;
|
||||
import org.languagetool.TestTools;
|
||||
import org.languagetool.language.German;
|
||||
import org.languagetool.language.GermanyGerman;
|
||||
import org.languagetool.rules.RuleMatch;
|
||||
import org.languagetool.rules.spelling.hunspell.HunspellRule;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.is;
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class GermanSpellerRuleTest {
|
||||
|
||||
private static final German GERMAN_DE = (German) Languages.getLanguageForShortCode("de-DE");
|
||||
|
||||
@Test
|
||||
@Ignore //TODO: need rework: works only with preferred languages in userConfig
|
||||
public void testErrorLimitReached() throws IOException {
|
||||
HunspellRule rule1 = new GermanSpellerRule(TestTools.getMessages("de"), GERMAN_DE);
|
||||
JLanguageTool lt = new JLanguageTool(GERMAN_DE);
|
||||
RuleMatch[] matches1 = rule1.match(lt.getAnalyzedSentence("Ein schöner Satz."));
|
||||
assertThat(matches1.length, is(0));
|
||||
RuleMatch[] matches2 = rule1.match(lt.getAnalyzedSentence("But this is English."));
|
||||
assertThat(matches2.length, is(4));
|
||||
boolean match2Found = false;
|
||||
for (RuleMatch match : matches2) {
|
||||
if (match.getErrorLimitLang() != null && match.getErrorLimitLang().equals("en")) {
|
||||
match2Found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertTrue(match2Found);
|
||||
RuleMatch[] matches3 = rule1.match(lt.getAnalyzedSentence("Und er sagte, this is a good test."));
|
||||
assertThat(matches3.length, is(4));
|
||||
boolean match3Found = false;
|
||||
for (RuleMatch match : matches3) {
|
||||
if (match.getErrorLimitLang() != null && match.getErrorLimitLang().equals("en")) {
|
||||
match3Found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertFalse(match3Found);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore //TODO: need rework: works only with preferred languages in userConfig
|
||||
// case: signature is (mostly) English, user starts typing in German -> first, EN is detected for whole text
|
||||
// Also see MorfologikAmericanSpellerRuleTest
|
||||
public void testMultilingualSignatureCase() throws IOException {
|
||||
JLanguageTool lt = new JLanguageTool(GERMAN_DE);
|
||||
HunspellRule rule = new GermanSpellerRule(TestTools.getMessages("de"), GERMAN_DE);
|
||||
String sig = "-- " +
|
||||
"Das ist eine deutsche Signatur.\n" +
|
||||
"Eigentlich egal, was hier genau steht. Aber es reicht, um den Gesamttext als deutsch zu erkennen.\n";
|
||||
//assertZZ(lt, rule, "Hi Tom, I'm happy to discuss the\n\n" + sig); // "Hi Tom, I'm happy" also accepted by German speller
|
||||
//assertZZ(lt, rule, "Tom, I'm happy to discuss the\n\n" + sig);
|
||||
assertZZ(lt, rule, "Tom, could we meet next Monday\n\n" + sig);
|
||||
}
|
||||
|
||||
private void assertZZ(JLanguageTool lt, HunspellRule rule, String input) throws IOException {
|
||||
List<AnalyzedSentence> analyzedSentences = lt.analyzeText(input);
|
||||
RuleMatch[] matches = rule.match(analyzedSentences.get(0));
|
||||
/*System.out.println("--> " + input);
|
||||
for (RuleMatch ruleMatch : matches) {
|
||||
System.out.println(">>>" + ruleMatch.getRule().getId() + " " + ruleMatch.getErrorLimitLang());
|
||||
}*/
|
||||
assertThat(analyzedSentences.size(), is(4));
|
||||
assertThat(matches.length, is(5));
|
||||
boolean hasErrorLimitLang = false;
|
||||
for (RuleMatch rm : matches) {
|
||||
if (rm.getErrorLimitLang() != null && rm.getErrorLimitLang().equals("en")) {
|
||||
hasErrorLimitLang = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertTrue("Should have at least one match with errorLimitLang == \"en\"", hasErrorLimitLang);
|
||||
}
|
||||
|
||||
}
|
||||
342
multiLanguageSupport.md
Normal file
342
multiLanguageSupport.md
Normal file
|
|
@ -0,0 +1,342 @@
|
|||
### How does LanguageTool support texts in multiple languages?
|
||||
|
||||
Starting with version 6.4, LanguageTool improves the handling of texts written in multiple languages.
|
||||
|
||||
We currently set the language for a text before checking, or it can be set by the user in the add-on.
|
||||
|
||||
This means that individual sentences that differ from the main language produce many matches even though they contain no errors, or real errors go unrecognized because the wrong language was used when checking those sentences.
|
||||
|
||||
To address this, we have added a mechanism that allows sentences with too many spelling errors to be checked again by our languageDetectionService.
|
||||
|
||||
The previously found errors remain in the CheckResults object. Additionally, we put the new ExtendedSentenceRange data into the CheckResults object.
|
||||
|
||||
For example, the text:
|
||||
> Das ist Deutsch. This is english.
|
||||
|
||||
The corresponding request to the API would be:
|
||||
> https://languagetool.org/v2/check?text=Das%20ist%20Deutsch.%20This%20is%20english.&language=auto&preferredLanguages=de,en
|
||||
|
||||
Our server will detect this text as German and send this response:
|
||||
|
||||
```json
|
||||
{
|
||||
"software":{
|
||||
"name":"LanguageTool",
|
||||
"version":"6.4-SNAPSHOT",
|
||||
"buildDate":null,
|
||||
"apiVersion":1,
|
||||
"premium":false,
|
||||
"status":""
|
||||
},
|
||||
"warnings":{
|
||||
"incompleteResults":false
|
||||
},
|
||||
"language":{
|
||||
"name":"German (Germany)",
|
||||
"code":"de-DE",
|
||||
"detectedLanguage":{
|
||||
"name":"German (Germany)",
|
||||
"code":"de-DE",
|
||||
"confidence":0.99775517,
|
||||
"source":"ngram+prefLang(forced: false)"
|
||||
}
|
||||
},
|
||||
"matches":[
|
||||
{
|
||||
"message":"Möglicher Tippfehler gefunden.",
|
||||
"shortMessage":"Rechtschreibfehler",
|
||||
"replacements":[
|
||||
{
|
||||
"value":"Typ"
|
||||
},
|
||||
{
|
||||
"value":"Taz"
|
||||
},
|
||||
{
|
||||
"value":"Typs"
|
||||
},
|
||||
{
|
||||
"value":"Die"
|
||||
},
|
||||
{
|
||||
"value":"Tim"
|
||||
},
|
||||
{
|
||||
"value":"Des"
|
||||
},
|
||||
{
|
||||
"value":"Tims"
|
||||
},
|
||||
{
|
||||
"value":"Das"
|
||||
},
|
||||
{
|
||||
"value":"Thais"
|
||||
},
|
||||
{
|
||||
"value":"Bis"
|
||||
},
|
||||
{
|
||||
"value":"TVs"
|
||||
},
|
||||
{
|
||||
"value":"Dass"
|
||||
},
|
||||
{
|
||||
"value":"TTs"
|
||||
},
|
||||
{
|
||||
"value":"Dies"
|
||||
},
|
||||
{
|
||||
"value":"TiB"
|
||||
},
|
||||
{
|
||||
"value":"Hin"
|
||||
},
|
||||
{
|
||||
"value":"Theiß"
|
||||
},
|
||||
{
|
||||
"value":"Teil"
|
||||
},
|
||||
{
|
||||
"value":"Thies"
|
||||
},
|
||||
{
|
||||
"value":"Dahin"
|
||||
}
|
||||
],
|
||||
"offset":17,
|
||||
"length":4,
|
||||
"context":{
|
||||
"text":"Das ist Deutsch. This is english.",
|
||||
"offset":17,
|
||||
"length":4
|
||||
},
|
||||
"sentence":"This is english.",
|
||||
"type":{
|
||||
"typeName":"UnknownWord"
|
||||
},
|
||||
"rule":{
|
||||
"id":"GERMAN_SPELLER_RULE",
|
||||
"description":"Möglicher Rechtschreibfehler",
|
||||
"issueType":"misspelling",
|
||||
"category":{
|
||||
"id":"TYPOS",
|
||||
"name":"Mögliche Tippfehler"
|
||||
}
|
||||
},
|
||||
"ignoreForIncompleteSentence":false,
|
||||
"contextForSureMatch":0
|
||||
},
|
||||
{
|
||||
"message":"Möglicher Tippfehler gefunden.",
|
||||
"shortMessage":"Rechtschreibfehler",
|
||||
"replacements":[
|
||||
{
|
||||
"value":"ist"
|
||||
},
|
||||
{
|
||||
"value":"IS"
|
||||
},
|
||||
{
|
||||
"value":"die"
|
||||
},
|
||||
{
|
||||
"value":"in"
|
||||
},
|
||||
{
|
||||
"value":"im",
|
||||
"shortDescription":"Positionsangabe"
|
||||
},
|
||||
{
|
||||
"value":"mit"
|
||||
},
|
||||
{
|
||||
"value":"ein"
|
||||
},
|
||||
{
|
||||
"value":"bis"
|
||||
},
|
||||
{
|
||||
"value":"es"
|
||||
},
|
||||
{
|
||||
"value":"sie"
|
||||
},
|
||||
{
|
||||
"value":"wie"
|
||||
},
|
||||
{
|
||||
"value":"ihm",
|
||||
"shortDescription":"Dativ von 'er'"
|
||||
},
|
||||
{
|
||||
"value":"ihn"
|
||||
},
|
||||
{
|
||||
"value":"ihr"
|
||||
},
|
||||
{
|
||||
"value":"ins"
|
||||
},
|
||||
{
|
||||
"value":"hin"
|
||||
},
|
||||
{
|
||||
"value":"ich"
|
||||
},
|
||||
{
|
||||
"value":"nie"
|
||||
},
|
||||
{
|
||||
"value":"wir"
|
||||
},
|
||||
{
|
||||
"value":"bin"
|
||||
}
|
||||
],
|
||||
"offset":22,
|
||||
"length":2,
|
||||
"context":{
|
||||
"text":"Das ist Deutsch. This is english.",
|
||||
"offset":22,
|
||||
"length":2
|
||||
},
|
||||
"sentence":"This is english.",
|
||||
"type":{
|
||||
"typeName":"UnknownWord"
|
||||
},
|
||||
"rule":{
|
||||
"id":"GERMAN_SPELLER_RULE",
|
||||
"description":"Möglicher Rechtschreibfehler",
|
||||
"issueType":"misspelling",
|
||||
"category":{
|
||||
"id":"TYPOS",
|
||||
"name":"Mögliche Tippfehler"
|
||||
}
|
||||
},
|
||||
"ignoreForIncompleteSentence":false,
|
||||
"contextForSureMatch":0
|
||||
},
|
||||
{
|
||||
"message":"Möglicher Tippfehler gefunden.",
|
||||
"shortMessage":"Rechtschreibfehler",
|
||||
"replacements":[
|
||||
{
|
||||
"value":"englisch"
|
||||
},
|
||||
{
|
||||
"value":"englische"
|
||||
},
|
||||
{
|
||||
"value":"endlich"
|
||||
},
|
||||
{
|
||||
"value":"englisch-"
|
||||
},
|
||||
{
|
||||
"value":"entlieh"
|
||||
},
|
||||
{
|
||||
"value":"Anglist"
|
||||
},
|
||||
{
|
||||
"value":"Denglisch"
|
||||
},
|
||||
{
|
||||
"value":"Englisch"
|
||||
},
|
||||
{
|
||||
"value":"anglich"
|
||||
},
|
||||
{
|
||||
"value":"anglisch"
|
||||
},
|
||||
{
|
||||
"value":"enolisch"
|
||||
},
|
||||
{
|
||||
"value":"denglisch"
|
||||
}
|
||||
],
|
||||
"offset":25,
|
||||
"length":7,
|
||||
"context":{
|
||||
"text":"Das ist Deutsch. This is english.",
|
||||
"offset":25,
|
||||
"length":7
|
||||
},
|
||||
"sentence":"This is english.",
|
||||
"type":{
|
||||
"typeName":"UnknownWord"
|
||||
},
|
||||
"rule":{
|
||||
"id":"GERMAN_SPELLER_RULE",
|
||||
"description":"Möglicher Rechtschreibfehler",
|
||||
"issueType":"misspelling",
|
||||
"category":{
|
||||
"id":"TYPOS",
|
||||
"name":"Mögliche Tippfehler"
|
||||
}
|
||||
},
|
||||
"ignoreForIncompleteSentence":false,
|
||||
"contextForSureMatch":0
|
||||
}
|
||||
],
|
||||
"ignoreRanges":[
|
||||
{
|
||||
"from":17,
|
||||
"to":33,
|
||||
"language":{
|
||||
"code":"en"
|
||||
}
|
||||
}
|
||||
],
|
||||
"sentenceRanges":[
|
||||
[
|
||||
0,
|
||||
16
|
||||
],
|
||||
[
|
||||
17,
|
||||
33
|
||||
]
|
||||
],
|
||||
"extendedSentenceRanges":[
|
||||
{
|
||||
"from":0,
|
||||
"to":16,
|
||||
"detectedLanguages":[
|
||||
{
|
||||
"language":"de",
|
||||
"rate":1
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"from":17,
|
||||
"to":33,
|
||||
"detectedLanguages":[
|
||||
{
|
||||
"language":"en",
|
||||
"rate":1
|
||||
},
|
||||
{
|
||||
"language":"de",
|
||||
"rate":0
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
We already have ignoreRanges and sentenceRanges in the response. The extendedSentenceRanges are an extended combination of both.
|
||||
- one extendedSentenceRange per sentence (will replace the sentenceRanges)
|
||||
- at least one detected language per sentence (will replace the ignoreRanges → if the detected language does not equal the main language)
|
||||
|
||||
IMPORTANT: The additional language detection for each sentence only works if the user has at least 2 preferredLanguages and the detected language of a sentence is one of the preferredLanguages.
|
||||
|
||||
Client applications can use this information to trigger a new check for each sentence that differs from the main language.
|
||||
Loading…
Reference in a new issue