Address review remarks in unicode.py

This commit is contained in:
Paweł Romanowski 2019-06-10 21:13:01 +02:00
parent 60ccf89693
commit 2b47a085dd

View file

@@ -34,7 +34,7 @@ except ImportError:
from StringIO import StringIO
try:
# completely optional type hinting
# Completely optional type hinting
# (Python 2 compatible using comments,
# see: https://mypy.readthedocs.io/en/latest/python2.html)
# This is very helpful in typing-aware IDE like PyCharm.
@@ -43,9 +43,9 @@ except ImportError:
pass
# we don't use enum.Enum because of Python 2.7 compatibility
# We don't use enum.Enum because of Python 2.7 compatibility.
class UnicodeFiles(object):
# ReadMe does not contain any unicode data, we
# ReadMe does not contain any Unicode data, we
# only use it to extract versions.
README = "ReadMe.txt"
@@ -57,11 +57,15 @@ class UnicodeFiles(object):
UNICODE_DATA = "UnicodeData.txt"
UnicodeFiles.ALL_FILES = tuple(
getattr(UnicodeFiles, name) for name in dir(UnicodeFiles)
# The order doesn't really matter (Python < 3.6 won't preserve it),
# we only want to aggregate all the file names.
ALL_UNICODE_FILES = tuple(
value for name, value in UnicodeFiles.__dict__.items()
if not name.startswith("_")
)
assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"
# The directory this file is located in.
THIS_DIR = os.path.dirname(os.path.realpath(__file__))
@@ -97,18 +101,17 @@ EXPANDED_CATEGORIES = {
# This is the (inclusive) range of surrogate codepoints.
# These are not valid Rust characters.
# - they are not valid Rust characters
SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)
UnicodeData = namedtuple(
"UnicodeData", (
# conversions:
# Conversions:
"to_upper", "to_lower", "to_title",
# decompositions: canonical decompositions, compatibility decomp
# Decompositions: canonical decompositions, compatibility decomp
"canon_decomp", "compat_decomp",
# grouped: general categories and combining characters
# Grouped: general categories and combining characters
"general_categories", "combines",
)
)
@@ -136,10 +139,10 @@ def fetch_files(version=None):
return have_version
if version:
# check if the desired version exists on the server
# Check if the desired version exists on the server.
get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
else:
# extract the latest version
# Extract the latest version.
get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)
readme_url = get_fetch_url(UnicodeFiles.README)
@@ -153,14 +156,14 @@ def fetch_files(version=None):
download_dir = get_unicode_dir(unicode_version)
if not os.path.exists(download_dir):
# for 2.7 compat, we don't use exist_ok=True
# For 2.7 compat, we don't use `exist_ok=True`.
os.makedirs(download_dir)
for filename in UnicodeFiles.ALL_FILES:
for filename in ALL_UNICODE_FILES:
file_path = get_unicode_file_path(unicode_version, filename)
if os.path.exists(file_path):
# assume file on the server didn't change if it's been saved before
# Assume file on the server didn't change if it's been saved before.
continue
if filename == UnicodeFiles.README:
@@ -178,15 +181,16 @@ def check_stored_version(version):
# type: (Optional[str]) -> Optional[UnicodeVersion]
"""
Given desired Unicode version, return the version
if stored files are all present, and None otherwise.
if stored files are all present, and `None` otherwise.
"""
if not version:
# should always check latest version
# If no desired version specified, we should check what's the latest
# version, skipping stored version checks.
return None
fetch_dir = os.path.join(FETCH_DIR, version)
for filename in UnicodeFiles.ALL_FILES:
for filename in ALL_UNICODE_FILES:
file_path = os.path.join(fetch_dir, filename)
if not os.path.exists(file_path):
@@ -199,11 +203,11 @@ def check_stored_version(version):
def parse_readme_unicode_version(readme_content):
# type: (str) -> UnicodeVersion
"""
Parse the Unicode version contained in their ReadMe.txt file.
Parse the Unicode version contained in their `ReadMe.txt` file.
"""
# "raw string" is necessary for \d not being treated as escape char
# (for the sake of compat with future Python versions)
# see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
# "Raw string" is necessary for \d not being treated as escape char
# (for the sake of compat with future Python versions).
# See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
groups = re.search(pattern, readme_content).groups()
@@ -213,7 +217,7 @@ def parse_readme_unicode_version(readme_content):
def get_unicode_dir(unicode_version):
# type: (UnicodeVersion) -> str
"""
Indicate where the unicode data files should be stored.
Indicate in which parent dir the Unicode data files should be stored.
This returns a full, absolute path.
"""
@@ -223,7 +227,7 @@ def get_unicode_dir(unicode_version):
def get_unicode_file_path(unicode_version, filename):
# type: (UnicodeVersion, str) -> str
"""
Indicate where the unicode data file should be stored.
Indicate where the Unicode data file should be stored.
"""
return os.path.join(get_unicode_dir(unicode_version), filename)
@@ -239,22 +243,22 @@ def is_surrogate(n):
def load_unicode_data(file_path):
# type: (str) -> UnicodeData
"""
Load main unicode data.
Load main Unicode data.
"""
# conversions
# Conversions
to_lower = {} # type: Dict[int, Tuple[int, int, int]]
to_upper = {} # type: Dict[int, Tuple[int, int, int]]
to_title = {} # type: Dict[int, Tuple[int, int, int]]
# decompositions
# Decompositions
compat_decomp = {} # type: Dict[int, List[int]]
canon_decomp = {} # type: Dict[int, List[int]]
# combining characters
# Combining characters
# FIXME: combines are not used
combines = defaultdict(set) # type: Dict[str, Set[int]]
# categories
# Categories
general_categories = defaultdict(set) # type: Dict[str, Set[int]]
category_assigned_codepoints = set() # type: Set[int]
@@ -283,41 +287,42 @@ def load_unicode_data(file_path):
decomp, deci, digit, num, mirror,
old, iso, upcase, lowcase, titlecase) = data
# generate char to char direct common and simple conversions
# uppercase to lowercase
# Generate char to char direct common and simple conversions:
# Uppercase to lowercase
if lowcase != "" and code_org != lowcase:
to_lower[code] = (int(lowcase, 16), 0, 0)
# lowercase to uppercase
# Lowercase to uppercase
if upcase != "" and code_org != upcase:
to_upper[code] = (int(upcase, 16), 0, 0)
# title case
# Title case
if titlecase.strip() != "" and code_org != titlecase:
to_title[code] = (int(titlecase, 16), 0, 0)
# store decomposition, if given
# Store decomposition, if given
if decomp:
decompositions = decomp.split()[1:]
decomp_code_points = [int(i, 16) for i in decompositions]
if decomp.startswith("<"):
# compatibility decomposition
# Compatibility decomposition
compat_decomp[code] = decomp_code_points
else:
# canonical decomposition
# Canonical decomposition
canon_decomp[code] = decomp_code_points
# place letter in categories as appropriate
# Place letter in categories as appropriate.
for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
general_categories[cat].add(code)
category_assigned_codepoints.add(code)
# record combining class, if any
# Record combining class, if any.
if combine != "0":
combines[combine].add(code)
# generate Not_Assigned from Assigned
# Generate Not_Assigned from Assigned.
general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)
# Other contains Not_Assigned
@@ -336,7 +341,7 @@ def load_unicode_data(file_path):
def load_special_casing(file_path, unicode_data):
# type: (str, UnicodeData) -> None
"""
Load special casing data and enrich given unicode data.
Load special casing data and enrich given Unicode data.
"""
for line in fileinput.input(file_path):
data = line.split("#")[0].split(";")
@@ -474,9 +479,9 @@ def load_properties(file_path, interesting_props):
Load properties data and return in grouped form.
"""
props = defaultdict(list) # type: Dict[str, List[Tuple[int, int]]]
# "raw string" is necessary for \. and \w not to be treated as escape chars
# (for the sake of compat with future Python versions)
# see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
# "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars
# (for the sake of compat with future Python versions).
# See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
@@ -486,7 +491,7 @@ def load_properties(file_path, interesting_props):
groups = match.groups()
if len(groups) == 2:
# re1 matched
# `re1` matched (2 groups).
d_lo, prop = groups
d_hi = d_lo
else:
@@ -502,7 +507,7 @@ def load_properties(file_path, interesting_props):
props[prop].append((lo_value, hi_value))
# optimize if possible
# Optimize if possible.
for prop in props:
props[prop] = group_codepoints(ungroup_codepoints(props[prop]))
@@ -587,10 +592,10 @@ def compute_trie(raw_data, chunk_size):
for i in range(len(raw_data) // chunk_size):
data = raw_data[i * chunk_size : (i + 1) * chunk_size]
# postfix compression of child nodes (data chunks)
# (identical child nodes are shared)
# Postfix compression of child nodes (data chunks)
# (identical child nodes are shared).
# make a tuple out of the list so it's hashable
# Make a tuple out of the list so it's hashable.
child = tuple(data)
if child not in childmap:
childmap[child] = len(childmap)
@@ -609,7 +614,7 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
This yields string fragments that should be joined to produce
the final string.
See: bool_trie.rs
See: `bool_trie.rs`.
"""
chunk_size = 64
rawdata = [False] * 0x110000
@@ -617,7 +622,7 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
for cp in range(lo, hi + 1):
rawdata[cp] = True
# convert to bitmap chunks of chunk_size bits each
# Convert to bitmap chunks of `chunk_size` bits each.
chunks = []
for i in range(0x110000 // chunk_size):
chunk = 0
@@ -679,9 +684,9 @@ def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
# type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
"""
Generate Rust code for SmallBoolTrie struct.
Generate Rust code for `SmallBoolTrie` struct.
See: bool_trie.rs
See: `bool_trie.rs`.
"""
last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
n_chunks = last_chunk + 1
@@ -813,8 +818,8 @@ def main():
unicode_version = fetch_files(args.version)
print("Using Unicode version: {}".format(unicode_version.as_str))
# all the writing happens entirely in memory, we only write to file
# once we have generated the file content (it's not very large, <1 MB)
# All the writing happens entirely in memory, we only write to file
# once we have generated the file content (it's not very large, <1 MB).
buf = StringIO()
buf.write(PREAMBLE)
@@ -844,7 +849,7 @@ def main():
{"White_Space", "Join_Control", "Noncharacter_Code_Point",
"Pattern_White_Space"})
# category tables
# Category tables
for (name, categories, category_subset) in (
("general_category", unicode_data.general_categories, ["N", "Cc"]),
("derived_property", derived, want_derived),
@@ -858,7 +863,8 @@ def main():
tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
# will overwrite the file if it exists
# Actually write out the file content.
# Will overwrite the file if it exists.
with open(tables_rs_path, "w") as fd:
fd.write(buf.getvalue())