mirror of
https://github.com/languagetool-org/languagetool
synced 2026-04-21 13:37:25 +00:00
[de] fix casing rules (#11947)
* [de] fix UPPERCASE_SENTENCE_START and DE_CASE * [de] fix test fail in casing rule * [de] remove unnecessary parts in code * [de] fix bereiten misread as adjective * [de] other case fixes * [de] remove invalid parts and add new abbreviation * [de] fix parsing error
This commit is contained in:
parent
a4f9f3ac0a
commit
527552f4e7
6 changed files with 89 additions and 11 deletions
|
|
@ -5164,7 +5164,7 @@
|
|||
</rule>
|
||||
<!-- Don't split at "bla bla... yada yada" -->
|
||||
<rule break="no">
|
||||
<beforebreak>[\[\(]?\.\.\.[\]\)]?[\u00A0\s]{1,2}</beforebreak>
|
||||
<beforebreak>[\[\(]?(\.\.\.|…)[\]\)]?[\u00A0\s]{1,2}</beforebreak>
|
||||
<afterbreak>\p{Ll}</afterbreak>
|
||||
</rule>
|
||||
<!-- Don't split [.?!] when they're quoted -->
|
||||
|
|
@ -5245,7 +5245,7 @@
|
|||
</rule>
|
||||
<!-- German abbreviations -->
|
||||
<rule break="no">
  <!-- Abbreviations ending in a period must not end the sentence.
       Fix: "[Dd]bzgl[Ss]tellv" was a single concatenated alternative (missing "|"),
       so "dbzgl." was never matched; split into "[Dd]bzgl|[Ss]tellv". -->
  <beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|Flgh?|[Pp]arl|Bsp|versch|[Dd]iesbzgl|[Zz]ykl|[Dd]bzgl|[Ss]tellv|[Ss]tv|d|Übers|[Bb]zw|Ab[hkst]|[Ee]ig|[Aa]bzü?gl|\d+-tlg|tlg|[Gg]gfls|[Ff]achspr|[Ll]tda|[Ee]inschl|[Vv]mtl|[Ss]tellv|Ev|[Bb]ezgl|lit|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|wsl|vsl|Bez|Bhf|Blvd|[Bb]spw|btto|bw|Dtl|[Gg]esetzl|[Ee]lektr|Dez|[Jj]gdfr|[Ee]ff|M)\.[\u00A0\s]{1,2}</beforebreak>
  <afterbreak></afterbreak>
</rule>
|
||||
<rule break="no">
|
||||
|
|
@ -5257,6 +5257,11 @@
|
|||
<beforebreak>\b([Ee]tc)\.[\u00A0\s]{1,2}</beforebreak>
|
||||
<afterbreak>\p{Lu}</afterbreak>
|
||||
</rule>
|
||||
<!-- Don't split at "o. Ä." -->
|
||||
<rule break="no">
|
||||
<beforebreak>\bo\.[\u00A0\s]{1,2}Ä\.</beforebreak>
|
||||
<afterbreak></afterbreak>
|
||||
</rule>
|
||||
<rule break="no">
|
||||
<beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|[Dd]t|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak>
|
||||
<afterbreak></afterbreak>
|
||||
|
|
@ -5299,15 +5304,15 @@
|
|||
</rule>
|
||||
<!-- Break rules -->
|
||||
<rule break="yes">
|
||||
<beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?[\u00A0\s]+</beforebreak>
|
||||
<beforebreak>[\.!?][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?[\u00A0\s]+</beforebreak>
|
||||
<afterbreak></afterbreak>
|
||||
</rule>
|
||||
<rule break="yes">
|
||||
<beforebreak>[\.!?…]['"“\p{Pe}\u00BB\u201D]?</beforebreak>
|
||||
<beforebreak>[\.!?]['"“\p{Pe}\u00BB\u201D]?</beforebreak>
|
||||
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
||||
</rule>
|
||||
<rule break="yes">
|
||||
<beforebreak>[\u00A0\s]\p{L}[\.!?…][\u00A0\s]{1,2}</beforebreak>
|
||||
<beforebreak>[\u00A0\s]\p{L}[\.!?][\u00A0\s]{1,2}</beforebreak>
|
||||
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
|
||||
</rule>
|
||||
<!-- z.B. 2 sentences: “Liebst du mich?” “Ja!” -->
|
||||
|
|
@ -6936,9 +6941,9 @@
|
|||
<beforebreak>[\.!?]</beforebreak>
|
||||
<afterbreak>\S*@</afterbreak>
|
||||
</rule>
|
||||
<!-- Some proper nouns: Yahoo!, Mission: Impossible-->
|
||||
<!-- Some proper nouns: Yahoo!, NAU!, Mission: Impossible-->
|
||||
<rule break="no">
|
||||
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
|
||||
<beforebreak>(Yahoo|NAU)![\s\u00A0]</beforebreak>
|
||||
<afterbreak>\p{Ll}</afterbreak>
|
||||
</rule>
|
||||
<rule break="no">
|
||||
|
|
|
|||
|
|
@ -83,12 +83,21 @@ public class CaseRule extends Rule {
|
|||
|
||||
  // Indefinite quantity words that may precede a nominalized adjective ("viel Neues").
  private static final String[] UNDEFINED_QUANTIFIERS = {"viel", "nichts", "nix", "wenig", "allerlei"};

  // Used for subordinate clause detection after verbs (e.g. "Überlegen, wie man...")
  private static final String[] INTERROGATIVE_PARTICLES = {"was", "wodurch", "wofür", "womit", "woran", "worauf", "woraus", "wovon", "wie"};

  // Articles/determiners in genitive-like contexts; presumably used to detect
  // possessive constructions — TODO confirm against the usage sites.
  private static final String[] POSSESSIVE_INDICATORS = {"einer", "eines", "der", "des", "dieser", "dieses"};

  // Words after "das" that rule out the verb reading — NOTE(review): verify at call site.
  private static final String[] DAS_VERB_EXCEPTIONS = {"nur", "sogar", "auch", "die", "alle", "viele", "zu"};

  // Used for short question fragments after a colon (e.g. "Ich frage mich: Warum?")
  // Intentionally different from INTERROGATIVE_PARTICLES, which targets subordinate clauses.
  private static final String[] COLON_QUESTION_WORDS = {"warum", "wieso", "weshalb", "wer", "was", "wann", "wo", "wie", "wozu"};

  // Coordinating conjunctions that may introduce such a fragment ("Und warum?").
  private static final String[] COLON_QUESTION_CONJUNCTIONS = {"und", "oder", "aber", "denn"};
|
||||
|
||||
/*
|
||||
* These are words that Morphy only knows as non-nouns (or not at all).
|
||||
* The proper solution is to add all those to our Morphy data, but as a simple
|
||||
|
|
@ -1042,10 +1051,15 @@ public class CaseRule extends Rule {
|
|||
!isInvisibleSeparator(i-1, tokens) &&
|
||||
!language.getDefaultSpellingRule().isMisspelled(lcWord)) {
|
||||
if (":".equals(tokens[i - 1].getToken())) {
|
||||
// allow short question sentences like "Warum? Und warum?" after colon
|
||||
if (isQuestionEquivalentAfterColon(i, tokens)) {
|
||||
return;
|
||||
}
|
||||
|
||||
AnalyzedTokenReadings[] subarray = new AnalyzedTokenReadings[i];
|
||||
System.arraycopy(tokens, 0, subarray, 0, i);
|
||||
|
||||
if (isVerbFollowing(i, tokens, lowercaseReadings) || getTokensWithPosTagStartingWithCount(subarray, "VER") == 0) {
|
||||
// no error
|
||||
} else {
|
||||
addRuleMatch(ruleMatches, sentence, COLON_MESSAGE, tokens[i], lcWord);
|
||||
}
|
||||
|
|
@ -1073,14 +1087,14 @@ public class CaseRule extends Rule {
|
|||
|
||||
private boolean isNounWithVerbReading(int i, AnalyzedTokenReadings[] tokens) {
|
||||
return tokens[i].hasPosTagStartingWith("SUB") &&
|
||||
tokens[i].hasPosTagStartingWith("VER:INF");
|
||||
}
|
||||
tokens[i].hasPosTagStartingWith("VER:INF");
|
||||
}
|
||||
|
||||
private boolean isInvisibleSeparator(int i, AnalyzedTokenReadings[] tokens) { // u2063 is used internally by our browser add-on
|
||||
return i >= 0 && i < tokens.length && tokens[i].getToken().length() > 0 && tokens[i].getToken().charAt(0) == '\u2063';
|
||||
}
|
||||
|
||||
private boolean isVerbFollowing(int i, AnalyzedTokenReadings[] tokens, AnalyzedTokenReadings lowercaseReadings) {
|
||||
private boolean isVerbFollowing(int i, AnalyzedTokenReadings[] tokens, AnalyzedTokenReadings lowercaseReadings) {
|
||||
AnalyzedTokenReadings[] subarray = new AnalyzedTokenReadings[ tokens.length - i ];
|
||||
System.arraycopy(tokens, i, subarray, 0, subarray.length);
|
||||
if (lowercaseReadings != null) {
|
||||
|
|
@ -1091,6 +1105,10 @@ public class CaseRule extends Rule {
|
|||
return getTokensWithPosTagStartingWithCount(subarray, "VER:") != 0;
|
||||
}
|
||||
|
||||
private boolean isColonQuestionWord(String word) {
|
||||
return StringUtils.equalsAnyIgnoreCase(word, COLON_QUESTION_WORDS);
|
||||
}
|
||||
|
||||
private void addRuleMatch(List<RuleMatch> ruleMatches, AnalyzedSentence sentence, String msg, AnalyzedTokenReadings tokenReadings, String fixedWord) {
|
||||
RuleMatch ruleMatch = new RuleMatch(this, sentence, tokenReadings.getStartPos(), tokenReadings.getEndPos(), msg);
|
||||
ruleMatch.setSuggestedReplacement(fixedWord);
|
||||
|
|
@ -1295,6 +1313,33 @@ public class CaseRule extends Rule {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the token at position i starts a short question fragment
|
||||
* that forms a valid Satzäquivalent after a colon, per Duden rules.
|
||||
* Handles:
|
||||
* single word: "Ich frage mich: Warum?"
|
||||
* with conjunction: "Ich frage mich: Und warum?"
|
||||
*/
|
||||
@VisibleForTesting
|
||||
boolean isQuestionEquivalentAfterColon(int i, AnalyzedTokenReadings[] tokens) {
|
||||
if (i < tokens.length - 1) {
|
||||
String word = tokens[i].getToken();
|
||||
String next = tokens[i + 1].getToken();
|
||||
// "Warum?"
|
||||
if (isColonQuestionWord(word) && "?".equals(next)) {
|
||||
return true;
|
||||
}
|
||||
// "Und warum?"
|
||||
if (StringUtils.equalsAnyIgnoreCase(word, COLON_QUESTION_CONJUNCTIONS)
|
||||
&& i < tokens.length - 2
|
||||
&& isColonQuestionWord(tokens[i + 1].getToken())
|
||||
&& "?".equals(tokens[i + 2].getToken())) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private AnalyzedTokenReadings lookup(String word) {
|
||||
try {
|
||||
return ((GermanTagger) language.getTagger()).lookup(word);
|
||||
|
|
@ -1303,3 +1348,4 @@ public class CaseRule extends Rule {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ Alpine[ns] Museums? (der|Münchens?)
|
|||
Agile Coach(e?|ing)s?
|
||||
Amnesty Internationals?
|
||||
Andreas Scheuers?
|
||||
Ankleiden?
|
||||
Anlage Sonstiges
|
||||
Angewandte[mnr]? Kunst
|
||||
Angewandte[mnr]? [A-ZÖÄÜ][a-zäöü]+(wissenschaft|forschung)(en)?
|
||||
|
|
@ -59,6 +60,7 @@ Angewandte[mnr]? (Optik|Informatik|Ethik|Geophysik|Physik|Makroökonomie|Linguis
|
|||
Anne Will
|
||||
Anarchistischen? Pogo-Partei
|
||||
Anthony Blinkens?
|
||||
Annies?
|
||||
Apokalyptische[nrm]? Reiters?
|
||||
Apostolische[nrm]? Palast(s|es)?
|
||||
Aqua Vitals?
|
||||
|
|
@ -195,6 +197,7 @@ Director Legal
|
|||
Director of Legal
|
||||
Die Toten Hosen
|
||||
Die Leiden des jungen Werthers
|
||||
Die Mumie
|
||||
[Dd]ie Schöne und das Biest
|
||||
Digital Assets?|IQ|Garage|Pushs?|Traces?|Pushing|Cockpits?|Ads?|Signals?|Designs?|Designer[ns]?|Engineerings?|Innovation|Media|Content|Community|Detox|Analyst|Leader[ns]?|Consultants?|Equipments?|Onboarding|Transformations?|Learnings?|Events?|Hubs?|Masterings?|Marketers?|Radios?|Experiences?|Kits?|Wallets?|Turbines?|Workspaces?|Leaderships?|Profiles?|News?|Managements?|Economy|Summits?|Fitness
|
||||
Digital Film Productions?
|
||||
|
|
@ -549,11 +552,13 @@ Kaukasische[nsm] Eichhörnchens?
|
|||
Kubanischen? Revolution
|
||||
kaum Neues
|
||||
Kessel Buntes
|
||||
KfW(-\d+)?(-[A-ZÄÖÜa-zäöüß]+)?
|
||||
Kitchen Impossible
|
||||
Klein (Nordendes?|Borstels?)
|
||||
Kleine[rnm]? Sauerampfers?
|
||||
Kleinen? Weser|Donau
|
||||
Knollige[rnm]? Hahnenfu(ß|ss)
|
||||
Kommunale[nms]? Integrationszentrum
|
||||
Lost (Places?|Islands?)
|
||||
Lost and Found
|
||||
Landesverband(e?s)? Baye?rischer? Omnibusunternehmen
|
||||
|
|
@ -772,6 +777,7 @@ Polar 8
|
|||
Potemkinschen? Dörfern?
|
||||
Politische[rn]? Direktors?
|
||||
Polytechnischen? (.*schule|Instituts?)
|
||||
Polytechnische[nr]? Universität
|
||||
Polnische[rnm]? (Korridors?|Fremdenverkehrsamt(e?s)?)
|
||||
Praktische[mns]? Jahr
|
||||
Present Progressive
|
||||
|
|
@ -782,6 +788,7 @@ Prima Terras?
|
|||
Product[/-]Market Fit
|
||||
Progressive Web Apps?
|
||||
Progressive Rocks?|Metals?
|
||||
ProMinent Dosiertechnik AG
|
||||
Psychologie Heute
|
||||
Psychologische[nsm]? Institute?s?
|
||||
Psychiatrische[rn]? Klinik ([A-ZÖÄÜ][a-zäöüß\-]+)
|
||||
|
|
@ -847,6 +854,7 @@ Schiefe[nr]? Turms?
|
|||
Schiefen? Ebene
|
||||
Schiefen? Schlachtordnung
|
||||
Schlesische[rnm]? Kriege?s?
|
||||
Schlesische[rn]? Universität
|
||||
Schlesische[snm]? Tors?
|
||||
Schnellen? Eingreiftruppe
|
||||
Schön Klinik
|
||||
|
|
@ -866,6 +874,7 @@ Schweizerischen? Konferenz
|
|||
Schweizerische[nrm]? Verband für
|
||||
Schweizerische[nrm]? Gehörlosenbund
|
||||
Schweizerische[nrm]? Fu(ss|ß)ballverband
|
||||
Schweizerische[rn]? Trassenvergabestelle(-[A-ZÄÖÜ][a-zäöüß]+)?
|
||||
Schweizer Illustrierten?
|
||||
Schweizerischen? Akademischen? Gesellschaft für Germanistik
|
||||
Seiner? Exzellenz
|
||||
|
|
@ -956,6 +965,7 @@ Theologischen? Fakultät
|
|||
Thüringischen? Landeszeitung
|
||||
Thüringer Allgemeinen?
|
||||
Thorsten Lahm
|
||||
THYMUSKIN Sensitive Shampoo
|
||||
Tiger Global
|
||||
Tote[nsm] Meer
|
||||
Total Gross|Values?|Blocking
|
||||
|
|
@ -1011,6 +1021,7 @@ Wei(ß|ss)e[nrm]? Zwerg(e|s|en|es)?|Sonntags?|Tod|Rings?|Burgunder[ns]?
|
|||
(WELT|Welt) Digital
|
||||
Wenn und Aber
|
||||
Werd Verlag
|
||||
Western Union International Bank
|
||||
Westliche[nrm]? Gorillas?
|
||||
Westfälische[nrm]? Friede(ns?)?|Pforte|Bucht|Nachrichten|Wilhelms-Universität|Hochschule|Anzeigers?
|
||||
Wilde[nrm]? Westens?|Kaisers?|Berge?s?
|
||||
|
|
|
|||
|
|
@ -1327,6 +1327,11 @@ Copyright © 2013 Markus Brenneis, Daniel Naber, Jan Schreiber
|
|||
<token postag="VER:1:PLU.*" postag_regexp="yes" />
|
||||
<token postag="SUB:.*" postag_regexp="yes" />
|
||||
</antipattern>
|
||||
<antipattern><!-- die Höhe bereitet/bereiten Kopfzerbrechen -->
|
||||
<token postag="SUB:.*" postag_regexp="yes"/>
|
||||
<token regexp="yes">bereitet?|bereiten</token>
|
||||
<token postag="SUB:.*" postag_regexp="yes"/>
|
||||
</antipattern>
|
||||
<pattern>
|
||||
<unify>
|
||||
<feature id="number"/><feature id="case"/><feature id="gender"/>
|
||||
|
|
|
|||
|
|
@ -383,6 +383,14 @@ public class CaseRuleTest {
|
|||
assertBad("Das ist es: Kein Satz.");
|
||||
assertBad("Wen magst du lieber: Die Giants oder die Dragons?");
|
||||
|
||||
assertGood("Ich frage mich: Warum?");
|
||||
assertGood("Ich frage mich: Wieso?");
|
||||
assertGood("Ich frage mich: Weshalb?");
|
||||
assertGood("Ich frage mich: Und warum?");
|
||||
assertGood("Ich frage mich: Oder wieso?");
|
||||
assertGood("Ich frage mich: Aber warum?");
|
||||
assertBad("Ich frage mich: Warum Das so ist.");
|
||||
|
||||
assertGood("Das wirklich Wichtige ist dies:");
|
||||
assertGood("Das wirklich wichtige Verfahren ist dies:");
|
||||
//assertBad("Das wirklich wichtige ist dies:");
|
||||
|
|
|
|||
|
|
@ -79,6 +79,9 @@ public class UppercaseSentenceStartRuleTest {
|
|||
assertEquals(1, matches3.size());
|
||||
assertEquals(19, matches3.get(0).getFromPos());
|
||||
assertEquals(22, matches3.get(0).getToPos());
|
||||
|
||||
assertEquals(0, lt.check("Willkommen… im Berlin.").size());
|
||||
assertEquals(0, lt.check("Die neue Kollektion von NAU! ist jetzt online.").size());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue