[de] fix casing rules (#11947)

* [de] fix UPPERCASE_SENTENCE_START and DE_CASE

* [de] fix test fail in casing rule

* [de] remove unnecessary parts in code

* [de] fix bereiten misread as adjective

* [de] other case fixes

* [de] remove invalid parts and add new abbreviation

* [de] fix parsing error
This commit is contained in:
Anna Rusalkina 2026-04-17 18:05:12 +02:00 committed by GitHub
parent a4f9f3ac0a
commit 527552f4e7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 89 additions and 11 deletions

View file

@ -5164,7 +5164,7 @@
</rule>
<!-- Don't split at "bla bla... yada yada" -->
<rule break="no">
<beforebreak>[\[\(]?\.\.\.[\]\)]?[\u00A0\s]{1,2}</beforebreak>
<beforebreak>[\[\(]?(\.\.\.|…)[\]\)]?[\u00A0\s]{1,2}</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
<!-- Don't split [.?!] when they're quoted -->
@ -5245,7 +5245,7 @@
</rule>
<!-- German abbreviations -->
<rule break="no">
<beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|Flgh?|[Pp]arl|Bsp|versch|[Dd]iesbzgl|[Zz]ykl|[Dd]bzgl[Ss]tellv|d|Übers|[Bb]zw|Ab[hkst]|[Ee]ig|[Aa]bzü?gl|\d+-tlg|tlg|[Gg]gfls|[Ff]achspr|[Ll]tda|[Ee]inschl|[Vv]mtl|[Ss]tellv|Ev|[Bb]ezgl|lit|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|wsl|vsl|Bez|Bhf|Blvd|[Bb]spw|btto|bw|Dtl|[Gg]esetzl|[Ee]lektr|Dez|[Jj]gdfr|[Ee]ff)\.[\u00A0\s]{1,2}</beforebreak>
<beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|Flgh?|[Pp]arl|Bsp|versch|[Dd]iesbzgl|[Zz]ykl|[Dd]bzgl[Ss]tellv|[Ss]tv|d|Übers|[Bb]zw|Ab[hkst]|[Ee]ig|[Aa]bzü?gl|\d+-tlg|tlg|[Gg]gfls|[Ff]achspr|[Ll]tda|[Ee]inschl|[Vv]mtl|[Ss]tellv|Ev|[Bb]ezgl|lit|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|wsl|vsl|Bez|Bhf|Blvd|[Bb]spw|btto|bw|Dtl|[Gg]esetzl|[Ee]lektr|Dez|[Jj]gdfr|[Ee]ff|M)\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
@ -5257,6 +5257,11 @@
<beforebreak>\b([Ee]tc)\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak>\p{Lu}</afterbreak>
</rule>
<!-- Don't split at "o. Ä." (German abbreviation, short for "oder Ähnliches") -->
<!-- NOTE(review): beforebreak ends with the final "Ä.", so with an empty afterbreak this rule applies to the position directly after that period. The gap between "o." and "Ä." would only be matched if "Ä\." were moved to afterbreak. Confirm this is the intended position. -->
<rule break="no">
<beforebreak>\bo\.[\u00A0\s]{1,2}Ä\.</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
<beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|[Dd]t|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak>
<afterbreak></afterbreak>
@ -5299,15 +5304,15 @@
</rule>
<!-- Break rules -->
<rule break="yes">
<beforebreak>[\.!?][\u0002|'|"|“|«||\)|\]|\}¹²³]?[\u00A0\s]+</beforebreak>
<beforebreak>[\.!?][\u0002|'|"|“|«||\)|\]|\}¹²³]?[\u00A0\s]+</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="yes">
<beforebreak>[\.!?]['"“\p{Pe}\u00BB\u201D]?</beforebreak>
<beforebreak>[\.!?]['"“\p{Pe}\u00BB\u201D]?</beforebreak>
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
</rule>
<rule break="yes">
<beforebreak>[\u00A0\s]\p{L}[\.!?][\u00A0\s]{1,2}</beforebreak>
<beforebreak>[\u00A0\s]\p{L}[\.!?][\u00A0\s]{1,2}</beforebreak>
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
<!-- z.B. 2 sentences: “Liebst du mich?” “Ja!” -->
@ -6936,9 +6941,9 @@
<beforebreak>[\.!?]</beforebreak>
<afterbreak>\S*@</afterbreak>
</rule>
<!-- Some proper nouns: Yahoo!, Mission: Impossible-->
<!-- Some proper nouns: Yahoo!, NAU!, Mission: Impossible-->
<rule break="no">
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
<beforebreak>(Yahoo|NAU)![\s\u00A0]</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
<rule break="no">

View file

@ -83,12 +83,21 @@ public class CaseRule extends Rule {
// NOTE(review): usage not visible in this excerpt - presumably indefinite quantity words
// that may precede a nominalized word ("viel Gutes"); confirm against the callers.
private static final String[] UNDEFINED_QUANTIFIERS = {"viel", "nichts", "nix", "wenig", "allerlei"};
// Used for subordinate clause detection after verbs (e.g. "Überlegen, wie man...")
private static final String[] INTERROGATIVE_PARTICLES = {"was", "wodurch", "wofür", "womit", "woran", "worauf", "woraus", "wovon", "wie"};
// NOTE(review): usage not visible in this excerpt - presumably determiners that introduce
// a genitive/possessive phrase; confirm against the callers.
private static final String[] POSSESSIVE_INDICATORS = {"einer", "eines", "der", "des", "dieser", "dieses"};
// NOTE(review): usage not visible in this excerpt - presumably words that exempt a
// "das" + verb sequence from being flagged; confirm against the callers.
private static final String[] DAS_VERB_EXCEPTIONS = {"nur", "sogar", "auch", "die", "alle", "viele", "zu"};
// Used for short question fragments after a colon (e.g. "Ich frage mich: Warum?")
// Intentionally different from INTERROGATIVE_PARTICLES, which targets subordinate clauses.
// Compared case-insensitively by isColonQuestionWord().
private static final String[] COLON_QUESTION_WORDS = {"warum", "wieso", "weshalb", "wer", "was", "wann", "wo", "wie", "wozu"};
// Conjunctions that may precede a colon question word (e.g. "Ich frage mich: Und warum?")
private static final String[] COLON_QUESTION_CONJUNCTIONS = {"und", "oder", "aber", "denn"};
/*
* These are words that Morphy only knows as non-nouns (or not at all).
* The proper solution is to add all those to our Morphy data, but as a simple
@ -1042,10 +1051,15 @@ public class CaseRule extends Rule {
!isInvisibleSeparator(i-1, tokens) &&
!language.getDefaultSpellingRule().isMisspelled(lcWord)) {
if (":".equals(tokens[i - 1].getToken())) {
// allow short question sentences like "Warum? Und warum?" after colon
if (isQuestionEquivalentAfterColon(i, tokens)) {
return;
}
AnalyzedTokenReadings[] subarray = new AnalyzedTokenReadings[i];
System.arraycopy(tokens, 0, subarray, 0, i);
if (isVerbFollowing(i, tokens, lowercaseReadings) || getTokensWithPosTagStartingWithCount(subarray, "VER") == 0) {
// no error
} else {
addRuleMatch(ruleMatches, sentence, COLON_MESSAGE, tokens[i], lcWord);
}
@ -1073,14 +1087,14 @@ public class CaseRule extends Rule {
/**
 * Checks whether the token at index {@code i} is ambiguous between a noun and an
 * infinitive verb, i.e. it carries at least one POS tag starting with "SUB" and
 * at least one starting with "VER:INF".
 *
 * @param i index into {@code tokens}; not range-checked here
 * @param tokens the analyzed tokens of the sentence
 * @return true if both a "SUB" and a "VER:INF" reading are present
 */
private boolean isNounWithVerbReading(int i, AnalyzedTokenReadings[] tokens) {
return tokens[i].hasPosTagStartingWith("SUB") &&
tokens[i].hasPosTagStartingWith("VER:INF");
}
tokens[i].hasPosTagStartingWith("VER:INF");
}
/**
 * Tells whether the token at index {@code i} starts with the invisible separator
 * character U+2063, which is used internally by our browser add-on.
 *
 * @param i index into {@code tokens}; an index outside the array yields false
 * @param tokens the analyzed tokens of the sentence
 * @return true if the token exists, is non-empty and begins with U+2063
 */
private boolean isInvisibleSeparator(int i, AnalyzedTokenReadings[] tokens) {
  if (i < 0 || i >= tokens.length) {
    return false;
  }
  String text = tokens[i].getToken();
  if (text.length() == 0) {
    return false;
  }
  return text.charAt(0) == '\u2063';
}
private boolean isVerbFollowing(int i, AnalyzedTokenReadings[] tokens, AnalyzedTokenReadings lowercaseReadings) {
private boolean isVerbFollowing(int i, AnalyzedTokenReadings[] tokens, AnalyzedTokenReadings lowercaseReadings) {
AnalyzedTokenReadings[] subarray = new AnalyzedTokenReadings[ tokens.length - i ];
System.arraycopy(tokens, i, subarray, 0, subarray.length);
if (lowercaseReadings != null) {
@ -1091,6 +1105,10 @@ public class CaseRule extends Rule {
return getTokensWithPosTagStartingWithCount(subarray, "VER:") != 0;
}
/**
 * Checks, ignoring case, whether {@code word} is one of the question words
 * listed in COLON_QUESTION_WORDS.
 *
 * @param word the token text to test
 * @return true if the word matches an entry of COLON_QUESTION_WORDS
 */
private boolean isColonQuestionWord(String word) {
  boolean matchesQuestionWord = StringUtils.equalsAnyIgnoreCase(word, COLON_QUESTION_WORDS);
  return matchesQuestionWord;
}
private void addRuleMatch(List<RuleMatch> ruleMatches, AnalyzedSentence sentence, String msg, AnalyzedTokenReadings tokenReadings, String fixedWord) {
RuleMatch ruleMatch = new RuleMatch(this, sentence, tokenReadings.getStartPos(), tokenReadings.getEndPos(), msg);
ruleMatch.setSuggestedReplacement(fixedWord);
@ -1295,6 +1313,33 @@ public class CaseRule extends Rule {
return true;
}
/**
 * Decides whether the token at position {@code i} opens a short question fragment
 * that counts as a valid Satzäquivalent after a colon, per Duden rules.
 * Two shapes are accepted:
 *   question word + "?":                "Ich frage mich: Warum?"
 *   conjunction + question word + "?":  "Ich frage mich: Und warum?"
 *
 * @param i index of the first token after the colon
 * @param tokens the analyzed tokens of the sentence
 * @return true if one of the two shapes starts at {@code i}; false otherwise,
 *         including when {@code i} is the last token or beyond
 */
@VisibleForTesting
boolean isQuestionEquivalentAfterColon(int i, AnalyzedTokenReadings[] tokens) {
  int lastIndex = tokens.length - 1;
  if (i >= lastIndex) {
    // at least one following token is needed for the "?"
    return false;
  }
  String first = tokens[i].getToken();
  String second = tokens[i + 1].getToken();
  // "Warum?"
  if (isColonQuestionWord(first) && "?".equals(second)) {
    return true;
  }
  // "Und warum?" needs a conjunction first and two more tokens after it
  boolean startsWithConjunction = StringUtils.equalsAnyIgnoreCase(first, COLON_QUESTION_CONJUNCTIONS);
  if (!startsWithConjunction || i + 2 > lastIndex) {
    return false;
  }
  return isColonQuestionWord(second) && "?".equals(tokens[i + 2].getToken());
}
private AnalyzedTokenReadings lookup(String word) {
try {
return ((GermanTagger) language.getTagger()).lookup(word);
@ -1303,3 +1348,4 @@ public class CaseRule extends Rule {
}
}
}

View file

@ -50,6 +50,7 @@ Alpine[ns] Museums? (der|Münchens?)
Agile Coach(e?|ing)s?
Amnesty Internationals?
Andreas Scheuers?
Ankleiden?
Anlage Sonstiges
Angewandte[mnr]? Kunst
Angewandte[mnr]? [A-ZÖÄÜ][a-zäöü]+(wissenschaft|forschung)(en)?
@ -59,6 +60,7 @@ Angewandte[mnr]? (Optik|Informatik|Ethik|Geophysik|Physik|Makroökonomie|Linguis
Anne Will
Anarchistischen? Pogo-Partei
Anthony Blinkens?
Annies?
Apokalyptische[nrm]? Reiters?
Apostolische[nrm]? Palast(s|es)?
Aqua Vitals?
@ -195,6 +197,7 @@ Director Legal
Director of Legal
Die Toten Hosen
Die Leiden des jungen Werthers
Die Mumie
[Dd]ie Schöne und das Biest
Digital Assets?|IQ|Garage|Pushs?|Traces?|Pushing|Cockpits?|Ads?|Signals?|Designs?|Designer[ns]?|Engineerings?|Innovation|Media|Content|Community|Detox|Analyst|Leader[ns]?|Consultants?|Equipments?|Onboarding|Transformations?|Learnings?|Events?|Hubs?|Masterings?|Marketers?|Radios?|Experiences?|Kits?|Wallets?|Turbines?|Workspaces?|Leaderships?|Profiles?|News?|Managements?|Economy|Summits?|Fitness
Digital Film Productions?
@ -549,11 +552,13 @@ Kaukasische[nsm] Eichhörnchens?
Kubanischen? Revolution
kaum Neues
Kessel Buntes
KfW(-\d+)?(-[A-ZÄÖÜa-zäöüß]+)?
Kitchen Impossible
Klein (Nordendes?|Borstels?)
Kleine[rnm]? Sauerampfers?
Kleinen? Weser|Donau
Knollige[rnm]? Hahnenfu(ß|ss)
Kommunale[nms]? Integrationszentrum
Lost (Places?|Islands?)
Lost and Found
Landesverband(e?s)? Baye?rischer? Omnibusunternehmen
@ -772,6 +777,7 @@ Polar 8
Potemkinschen? Dörfern?
Politische[rn]? Direktors?
Polytechnischen? (.*schule|Instituts?)
Polytechnische[nr]? Universität
Polnische[rnm]? (Korridors?|Fremdenverkehrsamt(e?s)?)
Praktische[mns]? Jahr
Present Progressive
@ -782,6 +788,7 @@ Prima Terras?
Product[/-]Market Fit
Progressive Web Apps?
Progressive Rocks?|Metals?
ProMinent Dosiertechnik AG
Psychologie Heute
Psychologische[nsm]? Institute?s?
Psychiatrische[rn]? Klinik ([A-ZÖÄÜ][a-zäöüß\-]+)
@ -847,6 +854,7 @@ Schiefe[nr]? Turms?
Schiefen? Ebene
Schiefen? Schlachtordnung
Schlesische[rnm]? Kriege?s?
Schlesische[rn]? Universität
Schlesische[snm]? Tors?
Schnellen? Eingreiftruppe
Schön Klinik
@ -866,6 +874,7 @@ Schweizerischen? Konferenz
Schweizerische[nrm]? Verband für
Schweizerische[nrm]? Gehörlosenbund
Schweizerische[nrm]? Fu(ss|ß)ballverband
Schweizerische[rn]? Trassenvergabestelle(-[A-ZÄÖÜ][a-zäöüß]+)?
Schweizer Illustrierten?
Schweizerischen? Akademischen? Gesellschaft für Germanistik
Seiner? Exzellenz
@ -956,6 +965,7 @@ Theologischen? Fakultät
Thüringischen? Landeszeitung
Thüringer Allgemeinen?
Thorsten Lahm
THYMUSKIN Sensitive Shampoo
Tiger Global
Tote[nsm] Meer
Total Gross|Values?|Blocking
@ -1011,6 +1021,7 @@ Wei(ß|ss)e[nrm]? Zwerg(e|s|en|es)?|Sonntags?|Tod|Rings?|Burgunder[ns]?
(WELT|Welt) Digital
Wenn und Aber
Werd Verlag
Western Union International Bank
Westliche[nrm]? Gorillas?
Westfälische[nrm]? Friede(ns?)?|Pforte|Bucht|Nachrichten|Wilhelms-Universität|Hochschule|Anzeigers?
Wilde[nrm]? Westens?|Kaisers?|Berge?s?

View file

@ -1327,6 +1327,11 @@ Copyright © 2013 Markus Brenneis, Daniel Naber, Jan Schreiber
<token postag="VER:1:PLU.*" postag_regexp="yes" />
<token postag="SUB:.*" postag_regexp="yes" />
</antipattern>
<antipattern><!-- die Höhe bereitet/bereiten Kopfzerbrechen -->
<token postag="SUB:.*" postag_regexp="yes"/>
<token regexp="yes">bereitet?|bereiten</token>
<token postag="SUB:.*" postag_regexp="yes"/>
</antipattern>
<pattern>
<unify>
<feature id="number"/><feature id="case"/><feature id="gender"/>

View file

@ -383,6 +383,14 @@ public class CaseRuleTest {
assertBad("Das ist es: Kein Satz.");
assertBad("Wen magst du lieber: Die Giants oder die Dragons?");
assertGood("Ich frage mich: Warum?");
assertGood("Ich frage mich: Wieso?");
assertGood("Ich frage mich: Weshalb?");
assertGood("Ich frage mich: Und warum?");
assertGood("Ich frage mich: Oder wieso?");
assertGood("Ich frage mich: Aber warum?");
assertBad("Ich frage mich: Warum Das so ist.");
assertGood("Das wirklich Wichtige ist dies:");
assertGood("Das wirklich wichtige Verfahren ist dies:");
//assertBad("Das wirklich wichtige ist dies:");

View file

@ -79,6 +79,9 @@ public class UppercaseSentenceStartRuleTest {
assertEquals(1, matches3.size());
assertEquals(19, matches3.get(0).getFromPos());
assertEquals(22, matches3.get(0).getToPos());
assertEquals(0, lt.check("Willkommen… im Berlin.").size());
assertEquals(0, lt.check("Die neue Kollektion von NAU! ist jetzt online.").size());
}
}