[Maposmatic-dev] [PATCH] Special case for converting street names entirely ...

maposmatic-dev

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Maposmatic-dev] [PATCH] Special case for converting street names entirely ...

From:	Konstantin Mochalov
Subject:	[Maposmatic-dev] [PATCH] Special case for converting street names entirely of status parts, i.e. 'Набережная улица'; more tests, renamed some variables and added comments.
Date:	Fri, 18 May 2012 10:53:33 +0400

---
 ocitysmap/i18n.py      |   21 +++++++++++++++------
 ocitysmap/i18n_test.py |   34 +++++++++++++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/ocitysmap/i18n.py b/ocitysmap/i18n.py
index 3521b64..56e0196 100644
--- a/ocitysmap/i18n.py
+++ b/ocitysmap/i18n.py
@@ -496,14 +496,19 @@ class i18n_ru_generic(i18n):
         (u"квартал", [u"кв-л", u"кв"]),
     ]
 
+    # matches one or more spaces
     SPACE_REDUCE = re.compile(r"\s+")
-    STATUS_PARTS_MAPPING = dict((f, t) for t, ff in STATUS_PARTS for f in ff)
-    STATUS_REGEXP = re.compile(r"\b(%s)\.?(?=\W|$)" % u"|".join(
+    # mapping from status abbreviations (w/o '.') to full status names
+    STATUS_PARTS_ABBREV_MAPPING = dict((f, t) for t, ff in STATUS_PARTS for f 
in ff)
+    # set of full (not abbreviated) status parts
+    STATUS_PARTS_FULL = set((x[0] for x in STATUS_PARTS))
+    # matches any abbreviated status part with optional '.'
+    STATUS_ABBREV_REGEXP = re.compile(r"\b(%s)\.?(?=\W|$)" % u"|".join(
         f for t, ff in STATUS_PARTS for f in ff), re.IGNORECASE | re.UNICODE)
+    # matches status prefixes at start of name used to move prefixes to the end
     PREFIX_REGEXP = re.compile(
         
ur"^(?P<num_prefix>\d+-?(ы?й|я))?\s*(?P<prefix>(%s)\.?)?\s*(?P<name>.+)?" %
         (u"|".join(f for f,t in STATUS_PARTS)), re.IGNORECASE | re.UNICODE)
-    STARTING_NUMBER_REGEXP = 
re.compile(ur"^(?P<prefix>\d+-?(ы?й|я))\s+(?P<name>.+)")
 
     def __init__(self, language, locale_path):
         self.language = str(language)
@@ -519,7 +524,11 @@ class i18n_ru_generic(i18n):
 
     @staticmethod
     def _rewrite_street_parts(matches):
-        if matches.group('num_prefix') is None and matches.group('prefix') is 
None:
+        if (matches.group('num_prefix') is None and
+            matches.group('prefix') is not None and
+            matches.group('name') in i18n_ru_generic.STATUS_PARTS_FULL):
+            return matches.group(0)
+        elif matches.group('num_prefix') is None and matches.group('prefix') 
is None:
             return matches.group(0)
         elif matches.group('name') is None:
             return matches.group(0)
@@ -535,8 +544,8 @@ class i18n_ru_generic(i18n):
         name = name.strip()
         name = self.SPACE_REDUCE.sub(" ", name)
         # Normalize abbreviations
-        name = self.STATUS_REGEXP.sub(lambda m:
-                self.STATUS_PARTS_MAPPING.get(
+        name = self.STATUS_ABBREV_REGEXP.sub(lambda m:
+                self.STATUS_PARTS_ABBREV_MAPPING.get(
                     m.group(0).replace('.', ''), m.group(0)),
             name)
         # Move prefixed status parts to the end for sorting
diff --git a/ocitysmap/i18n_test.py b/ocitysmap/i18n_test.py
index 22db1e9..153f467 100644
--- a/ocitysmap/i18n_test.py
+++ b/ocitysmap/i18n_test.py
@@ -26,12 +26,44 @@ class i18n_ru_generic_test(unittest.TestCase):
             (u"10-я Текстильная улица", u"Текстильная улица, 10-я"),
             (u"11-я линия В.О.", u"В.О., линия 11-я"),
 
-            # Not yet implemented:
+            (u'Сосновая улица', u'Сосновая улица'),
+            (u'1-й Спортивный проезд', u'Спортивный проезд, 1-й'),
+            (u'1-й Коммунальный проезд', u'Коммунальный проезд, 1-й'),
+            (u'Красивый переулок', u'Красивый переулок'),
+            (u'Больничная улица', u'Больничная улица'),
+            (u'Улица Берёзово', u'Берёзово, улица'),
+            (u'улица 8 Марта', u'8 Марта, улица'),
+            (u'улица Алёнкино', u'Алёнкино, улица'),
+            (u'улица 222-го Артполка', u'222-го Артполка, улица'),
+            (u'Вятка', u'Вятка'),
+            (u'улица Воинов-Интернационалистов', u'Воинов-Интернационалистов, 
улица'),
+            (u'переулок Юности', u'Юности, переулок'),
+            (u'Арматурная улица', u'Арматурная улица'),
+            (u'Пикалов мост', u'Пикалов мост'),
+            (u'Подьяческий мост', u'Подьяческий мост'),
+            (u'Лесопильный мост', u'Лесопильный мост'),
+            (u'набережная Лебяжьей канавки', u'Лебяжьей канавки, набережная'),
+            (u'Мало-Калинкин мост', u'Мало-Калинкин мост'),
+            (u'Площадь Академика Лихачёва', u'Академика Лихачёва, площадь'),
+            (u'площадь Академика Лихачёва', u'Академика Лихачёва, площадь'),
+            (u'9-я линия В.О.', u'В.О., линия 9-я'),
+            (u'открытию регулярного трамвайного движения в Санкт-Петербурге',
+                u'открытию регулярного трамвайного движения в 
Санкт-Петербурге'),
+            (u'Нефтяная дорога', u'Нефтяная дорога'),
+            (u'8-9-я линии', u'8-9-я линии'),
+
+            # Not yet implemented, no doubt how to convert name right,
+            # there can be different conventions:
             #(u'Малая Пушкарская улица', u'Пушкарская малая, улица'),
+            #(u'набережная реки Мойки', u'Мойки, реки набережная'),
+            #(u'Большой проспект П.С', u'?'),
 
             # special case - no name, only prefix
             (u"10-я аллея", u"10-я аллея"),
             (u"123-я улица", u"123-я улица"),
+
+            # name can be classified as status part
+            (u'Набережная улица', u'Набережная улица'),
         ]
         for fr, to in conversions:
             self.assertEqual(to, self.r.user_readable_street(fr))
-- 
1.7.6.msysgit.0

[Prev in Thread]

Current Thread

[Next in Thread]

[Maposmatic-dev] [PATCH] Russian street name normalizer: i18n_ru_generic.user_readable_street, using table of status parts from streetmangler project, Konstantin Mochalov, 2012/05/11
- Re: [Maposmatic-dev] [PATCH] Russian street name normalizer: i18n_ru_generic.user_readable_street, using table of status parts from streetmangler project, David MENTRE, 2012/05/17
- Re: [Maposmatic-dev] [PATCH] Russian street name normalizer: i18n_ru_generic.user_readable_street, using table of status parts from streetmangler project, Thomas Petazzoni, 2012/05/17
  - Re: [Maposmatic-dev] [PATCH] Russian street name normalizer: i18n_ru_generic.user_readable_street, using table of status parts from streetmangler project, Konstantin Mochalov, 2012/05/18
  - [Maposmatic-dev] [PATCH] Special case for converting street names entirely of status parts, i.e. 'Набережная улица'; more tests, renamed some variables and added comments., Konstantin Mochalov <=
    - Re: [Maposmatic-dev] [PATCH] Special case for converting street names entirely of status parts, i.e. 'Набережная улица'; more tests, renamed some variables and added comments., Thomas Petazzoni, 2012/05/18