Address review remarks in unicode.py

This commit is contained in:
Paweł Romanowski 2019-06-10 21:13:01 +02:00
parent 60ccf89693
commit 2b47a085dd

View file

@@ -34,7 +34,7 @@ except ImportError:
from StringIO import StringIO
try:
# completely optional type hinting
# Completely optional type hinting
# (Python 2 compatible using comments,
# see: https://mypy.readthedocs.io/en/latest/python2.html)
# This is very helpful in typing-aware IDE like PyCharm.
@@ -43,9 +43,9 @@ except ImportError:
pass
# we don't use enum.Enum because of Python 2.7 compatibility
# We don't use enum.Enum because of Python 2.7 compatibility.
class UnicodeFiles(object):
# ReadMe does not contain any unicode data, we
# ReadMe does not contain any Unicode data, we
# only use it to extract versions.
README = "ReadMe.txt"
@@ -57,11 +57,15 @@ class UnicodeFiles(object):
UNICODE_DATA = "UnicodeData.txt"
UnicodeFiles.ALL_FILES = tuple(
getattr(UnicodeFiles, name) for name in dir(UnicodeFiles)
# The order doesn't really matter (Python < 3.6 won't preserve it),
# we only want to aggregate all the file names.
ALL_UNICODE_FILES = tuple(
value for name, value in UnicodeFiles.__dict__.items()
if not name.startswith("_")
)
assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"
# The directory this file is located in.
THIS_DIR = os.path.dirname(os.path.realpath(__file__))
@@ -97,18 +101,17 @@ EXPANDED_CATEGORIES = {
# This is the (inclusive) range of surrogate codepoints.
# These are not valid Rust characters.
# - they are not valid Rust characters
SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)
UnicodeData = namedtuple(
"UnicodeData", (
# conversions:
# Conversions:
"to_upper", "to_lower", "to_title",
# decompositions: canonical decompositions, compatibility decomp
# Decompositions: canonical decompositions, compatibility decomp
"canon_decomp", "compat_decomp",
# grouped: general categories and combining characters
# Grouped: general categories and combining characters
"general_categories", "combines",
)
)
@@ -136,10 +139,10 @@ def fetch_files(version=None):
return have_version
if version:
# check if the desired version exists on the server
# Check if the desired version exists on the server.
get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
else:
# extract the latest version
# Extract the latest version.
get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)
readme_url = get_fetch_url(UnicodeFiles.README)
@@ -153,14 +156,14 @@ def fetch_files(version=None):
download_dir = get_unicode_dir(unicode_version)
if not os.path.exists(download_dir):
# for 2.7 compat, we don't use exist_ok=True
# For 2.7 compat, we don't use `exist_ok=True`.
os.makedirs(download_dir)
for filename in UnicodeFiles.ALL_FILES:
for filename in ALL_UNICODE_FILES:
file_path = get_unicode_file_path(unicode_version, filename)
if os.path.exists(file_path):
# assume file on the server didn't change if it's been saved before
# Assume file on the server didn't change if it's been saved before.
continue
if filename == UnicodeFiles.README:
@@ -178,15 +181,16 @@ def check_stored_version(version):
# type: (Optional[str]) -> Optional[UnicodeVersion]
"""
Given desired Unicode version, return the version
if stored files are all present, and None otherwise.
if stored files are all present, and `None` otherwise.
"""
if not version:
# should always check latest version
# If no desired version specified, we should check what's the latest
# version, skipping stored version checks.
return None
fetch_dir = os.path.join(FETCH_DIR, version)
for filename in UnicodeFiles.ALL_FILES:
for filename in ALL_UNICODE_FILES:
file_path = os.path.join(fetch_dir, filename)
if not os.path.exists(file_path):
@@ -199,11 +203,11 @@ def check_stored_version(version):
def parse_readme_unicode_version(readme_content):
# type: (str) -> UnicodeVersion
"""
Parse the Unicode version contained in their ReadMe.txt file.
Parse the Unicode version contained in their `ReadMe.txt` file.
"""
# "raw string" is necessary for \d not being treated as escape char
# (for the sake of compat with future Python versions)
# see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
# "Raw string" is necessary for \d not being treated as escape char
# (for the sake of compat with future Python versions).
# See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
groups = re.search(pattern, readme_content).groups()
@@ -213,7 +217,7 @@ def parse_readme_unicode_version(readme_content):
def get_unicode_dir(unicode_version):
# type: (UnicodeVersion) -> str
"""
Indicate where the unicode data files should be stored.
Indicate in which parent dir the Unicode data files should be stored.
This returns a full, absolute path.
"""
@@ -223,7 +227,7 @@ def get_unicode_dir(unicode_version):
def get_unicode_file_path(unicode_version, filename):
# type: (UnicodeVersion, str) -> str
"""
Indicate where the unicode data file should be stored.
Indicate where the Unicode data file should be stored.
"""
return os.path.join(get_unicode_dir(unicode_version), filename)
@@ -239,22 +243,22 @@ def is_surrogate(n):
def load_unicode_data(file_path):
# type: (str) -> UnicodeData
"""
Load main unicode data.
Load main Unicode data.
"""
# conversions
# Conversions
to_lower = {} # type: Dict[int, Tuple[int, int, int]]
to_upper = {} # type: Dict[int, Tuple[int, int, int]]
to_title = {} # type: Dict[int, Tuple[int, int, int]]
# decompositions
# Decompositions
compat_decomp = {} # type: Dict[int, List[int]]
canon_decomp = {} # type: Dict[int, List[int]]
# combining characters
# Combining characters
# FIXME: combines are not used
combines = defaultdict(set) # type: Dict[str, Set[int]]
# categories
# Categories
general_categories = defaultdict(set) # type: Dict[str, Set[int]]
category_assigned_codepoints = set() # type: Set[int]
@@ -283,41 +287,42 @@ def load_unicode_data(file_path):
decomp, deci, digit, num, mirror,
old, iso, upcase, lowcase, titlecase) = data
# generate char to char direct common and simple conversions
# uppercase to lowercase
# Generate char to char direct common and simple conversions:
# Uppercase to lowercase
if lowcase != "" and code_org != lowcase:
to_lower[code] = (int(lowcase, 16), 0, 0)
# lowercase to uppercase
# Lowercase to uppercase
if upcase != "" and code_org != upcase:
to_upper[code] = (int(upcase, 16), 0, 0)
# title case
# Title case
if titlecase.strip() != "" and code_org != titlecase:
to_title[code] = (int(titlecase, 16), 0, 0)
# store decomposition, if given
# Store decomposition, if given
if decomp:
decompositions = decomp.split()[1:]
decomp_code_points = [int(i, 16) for i in decompositions]
if decomp.startswith("<"):
# compatibility decomposition
# Compatibility decomposition
compat_decomp[code] = decomp_code_points
else:
# canonical decomposition
# Canonical decomposition
canon_decomp[code] = decomp_code_points
# place letter in categories as appropriate
# Place letter in categories as appropriate.
for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
general_categories[cat].add(code)
category_assigned_codepoints.add(code)
# record combining class, if any
# Record combining class, if any.
if combine != "0":
combines[combine].add(code)
# generate Not_Assigned from Assigned
# Generate Not_Assigned from Assigned.
general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)
# Other contains Not_Assigned
@@ -336,7 +341,7 @@ def load_unicode_data(file_path):
def load_special_casing(file_path, unicode_data):
# type: (str, UnicodeData) -> None
"""
Load special casing data and enrich given unicode data.
Load special casing data and enrich given Unicode data.
"""
for line in fileinput.input(file_path):
data = line.split("#")[0].split(";")
@@ -474,9 +479,9 @@ def load_properties(file_path, interesting_props):
Load properties data and return in grouped form.
"""
props = defaultdict(list) # type: Dict[str, List[Tuple[int, int]]]
# "raw string" is necessary for \. and \w not to be treated as escape chars
# (for the sake of compat with future Python versions)
# see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
# "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars
# (for the sake of compat with future Python versions).
# See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
@@ -486,7 +491,7 @@ def load_properties(file_path, interesting_props):
groups = match.groups()
if len(groups) == 2:
# re1 matched
# `re1` matched (2 groups).
d_lo, prop = groups
d_hi = d_lo
else:
@@ -502,7 +507,7 @@ def load_properties(file_path, interesting_props):
props[prop].append((lo_value, hi_value))
# optimize if possible
# Optimize if possible.
for prop in props:
props[prop] = group_codepoints(ungroup_codepoints(props[prop]))
@@ -587,10 +592,10 @@ def compute_trie(raw_data, chunk_size):
for i in range(len(raw_data) // chunk_size):
data = raw_data[i * chunk_size : (i + 1) * chunk_size]
# postfix compression of child nodes (data chunks)
# (identical child nodes are shared)
# Postfix compression of child nodes (data chunks)
# (identical child nodes are shared).
# make a tuple out of the list so it's hashable
# Make a tuple out of the list so it's hashable.
child = tuple(data)
if child not in childmap:
childmap[child] = len(childmap)
@@ -609,7 +614,7 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
This yields string fragments that should be joined to produce
the final string.
See: bool_trie.rs
See: `bool_trie.rs`.
"""
chunk_size = 64
rawdata = [False] * 0x110000
@@ -617,7 +622,7 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
for cp in range(lo, hi + 1):
rawdata[cp] = True
# convert to bitmap chunks of chunk_size bits each
# Convert to bitmap chunks of `chunk_size` bits each.
chunks = []
for i in range(0x110000 // chunk_size):
chunk = 0
@@ -679,9 +684,9 @@ def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
# type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
"""
Generate Rust code for SmallBoolTrie struct.
Generate Rust code for `SmallBoolTrie` struct.
See: bool_trie.rs
See: `bool_trie.rs`.
"""
last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
n_chunks = last_chunk + 1
@@ -813,8 +818,8 @@ def main():
unicode_version = fetch_files(args.version)
print("Using Unicode version: {}".format(unicode_version.as_str))
# all the writing happens entirely in memory, we only write to file
# once we have generated the file content (it's not very large, <1 MB)
# All the writing happens entirely in memory, we only write to file
# once we have generated the file content (it's not very large, <1 MB).
buf = StringIO()
buf.write(PREAMBLE)
@@ -844,7 +849,7 @@ def main():
{"White_Space", "Join_Control", "Noncharacter_Code_Point",
"Pattern_White_Space"})
# category tables
# Category tables
for (name, categories, category_subset) in (
("general_category", unicode_data.general_categories, ["N", "Cc"]),
("derived_property", derived, want_derived),
@@ -858,7 +863,8 @@ def main():
tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
# will overwrite the file if it exists
# Actually write out the file content.
# Will overwrite the file if it exists.
with open(tables_rs_path, "w") as fd:
fd.write(buf.getvalue())