diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index a5eb42f..359751c 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -48,10 +48,11 @@ def is_mark(codepoint):
     return codepoint.general_category in ("Mn", "Me", "Mc")
 
 def is_letter_with_marks(codepoint, table):
-    """Returns true for plain letters combined with one or more marks."""
+    """Returns true for letters combined with one or more marks."""
     # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
     return len(codepoint.combining_ids) > 1 and \
-        is_plain_letter(table[codepoint.combining_ids[0]]) and \
+        (is_plain_letter(table[codepoint.combining_ids[0]]) or \
+         is_letter_with_marks(table[codepoint.combining_ids[0]], table)) and \
         all(is_mark(table[i]) for i in codepoint.combining_ids[1:])
 
 def is_letter(codepoint, table):
@@ -61,11 +62,17 @@ def is_letter(codepoint, table):
 def get_plain_letter(codepoint, table):
-    """Return the base codepoint without marks."""
+    """Return the base codepoint without marks, or None if there is none."""
     if is_letter_with_marks(codepoint, table):
-        return table[codepoint.combining_ids[0]]
+        if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
+            # Recurse to find the plain letter
+            return get_plain_letter(table[codepoint.combining_ids[0]], table)
+        elif is_plain_letter(table[codepoint.combining_ids[0]]):
+            return table[codepoint.combining_ids[0]]
+        else:
+            return None
     elif is_plain_letter(codepoint):
         return codepoint
     else:
-        raise "mu"
+        return None
 
 def is_ligature(codepoint, table):
     """Return true for letters combined with letters."""
@@ -160,7 +167,9 @@ def main(args):
         if codepoint.general_category.startswith('L') and \
            len(codepoint.combining_ids) > 1:
             if is_letter_with_marks(codepoint, table):
-                charactersSet.add((codepoint.id,
-                             chr(get_plain_letter(codepoint, table).id)))
+                plain_letter = get_plain_letter(codepoint, table)
+                if plain_letter is not None:
+                    charactersSet.add((codepoint.id,
+                                       chr(plain_letter.id)))
             elif args.noLigaturesExpansion is False and 
is_ligature(codepoint, table): charactersSet.add((codepoint.id, diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 84886da..97f9ed4 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -254,6 +254,18 @@ ǒ o Ǔ U ǔ u +Ǖ U +ǖ u +Ǘ U +ǘ u +Ǚ U +ǚ u +Ǜ U +ǜ u +Ǟ A +ǟ a +Ǡ A +ǡ a Ǥ G ǥ g Ǧ G @@ -262,6 +274,8 @@ ǩ k Ǫ O ǫ o +Ǭ O +ǭ o ǰ j DZ DZ Dz Dz @@ -270,6 +284,8 @@ ǵ g Ǹ N ǹ n +Ǻ A +ǻ a Ȁ A ȁ a Ȃ A @@ -307,8 +323,14 @@ ȧ a Ȩ E ȩ e +Ȫ O +ȫ o +Ȭ O +ȭ o Ȯ O ȯ o +Ȱ O +ȱ o Ȳ Y ȳ y ȴ l @@ -441,6 +463,8 @@ ḅ b Ḇ B ḇ b +Ḉ C +ḉ c Ḋ D ḋ d Ḍ D @@ -451,10 +475,16 @@ ḑ d Ḓ D ḓ d +Ḕ E +ḕ e +Ḗ E +ḗ e Ḙ E ḙ e Ḛ E ḛ e +Ḝ E +ḝ e Ḟ F ḟ f Ḡ G @@ -471,6 +501,8 @@ ḫ h Ḭ I ḭ i +Ḯ I +ḯ i Ḱ K ḱ k Ḳ K @@ -479,6 +511,8 @@ ḵ k Ḷ L ḷ l +Ḹ L +ḹ l Ḻ L ḻ l Ḽ L @@ -497,6 +531,14 @@ ṉ n Ṋ N ṋ n +Ṍ O +ṍ o +Ṏ O +ṏ o +Ṑ O +ṑ o +Ṓ O +ṓ o Ṕ P ṕ p Ṗ P @@ -505,12 +547,20 @@ ṙ r Ṛ R ṛ r +Ṝ R +ṝ r Ṟ R ṟ r Ṡ S ṡ s Ṣ S ṣ s +Ṥ S +ṥ s +Ṧ S +ṧ s +Ṩ S +ṩ s Ṫ T ṫ t Ṭ T @@ -525,6 +575,10 @@ ṵ u Ṷ U ṷ u +Ṹ U +ṹ u +Ṻ U +ṻ u Ṽ V ṽ v Ṿ V @@ -563,12 +617,42 @@ ạ a Ả A ả a +Ấ A +ấ a +Ầ A +ầ a +Ẩ A +ẩ a +Ẫ A +ẫ a +Ậ A +ậ a +Ắ A +ắ a +Ằ A +ằ a +Ẳ A +ẳ a +Ẵ A +ẵ a +Ặ A +ặ a Ẹ E ẹ e Ẻ E ẻ e Ẽ E ẽ e +Ế E +ế e +Ề E +ề e +Ể E +ể e +Ễ E +ễ e +Ệ E +ệ e Ỉ I ỉ i Ị I @@ -577,10 +661,40 @@ ọ o Ỏ O ỏ o +Ố O +ố o +Ồ O +ồ o +Ổ O +ổ o +Ỗ O +ỗ o +Ộ O +ộ o +Ớ O +ớ o +Ờ O +ờ o +Ở O +ở o +Ỡ O +ỡ o +Ợ O +ợ o Ụ U ụ u Ủ U ủ u +Ứ U +ứ u +Ừ U +ừ u +Ử U +ử u +Ữ U +ữ u +Ự U +ự u Ỳ Y ỳ y Ỵ Y