# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from io import StringIO
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional
import re


@dataclass
class PropertyRange:
    lower: int = -1
    upper: int = -1
    prop: str = None


@dataclass
class PropertyTable:
    lower_bounds: list[int] = field(default_factory=list)
    props_and_range: list[int] = field(default_factory=list)


LINE_REGEX = re.compile(
    r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*(?P<prop>\w+)")

def parse_property_line(inputLine: str) -> Optional[PropertyRange]:
    result = PropertyRange()
    if m := LINE_REGEX.match(inputLine):
        lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
        result.lower = int(lower_str, base=16)
        result.upper = result.lower
        if upper_str is not None:
            result.upper = int(upper_str, base=16)
        return result
    else:
        return None


def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]:
    """
    Merges consecutive ranges with the same property to one range.

    Merging the ranges results in fewer ranges in the output table,
    reducing binary size and improving lookup performance.
    """
    result = list()
    for x in input:
        if (
            len(result)
            and result[-1].prop == x.prop
            and result[-1].upper + 1 == x.lower
        ):
            result[-1].upper = x.upper
            continue
        result.append(x)
    return result


PROP_VALUE_ENUMERATOR_TEMPLATE = "_{}_value"
PROP_VALUE_ENUM_TEMPLATE = """
{filename}
{timestamp}
enum class _{prop_name}_property_values : uint8_t {{
    {enumerators},
    _No_value = 255
}};
"""

DATA_ARRAY_TEMPLATE = """
{filename}
{timestamp}
inline constexpr _Unicode_property_data<_{prop_name}_property_values, {size}, {is_binary_property}>
    _{prop_name}_property_data{{
    {{{lower_bounds}}},
    {{{props_and_size}}}
}};
"""

WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
{filename}
{timestamp}
inline constexpr char32_t _Width_estimate_intervals_v2[] = {{
    {data}
}};
"""

MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
// __msvc_format_ucd_tables.hpp internal header

// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// WARNING, this entire header is generated by
// tools/unicode_properties_parse/unicode_properties_data_gen.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef __MSVC_FORMAT_UCD_TABLES_HPP
#define __MSVC_FORMAT_UCD_TABLES_HPP
#include <yvals_core.h>
#if _STL_COMPILER_PREPROCESSOR

#include <cstdint>
#include <xutility>

#pragma pack(push, _CRT_PACKING)
#pragma warning(push, _STL_WARNING_LEVEL)
#pragma warning(disable : _STL_DISABLED_WARNINGS)
_STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new

_STD_BEGIN

template <class _ValueEnum, size_t _NumRanges, bool _Is_binary_property>
struct _Unicode_property_data {{
    uint32_t _Lower_bounds[_NumRanges];
    uint16_t _Props_and_size[_NumRanges];
    _NODISCARD constexpr _ValueEnum _Get_property_for_codepoint(const uint32_t _Code_point) const noexcept {{
        ptrdiff_t _Upper_idx = _STD upper_bound(_Lower_bounds, _STD end(_Lower_bounds), _Code_point) - _Lower_bounds;
        constexpr auto _No_value_constant = static_cast<_ValueEnum>(UINT8_MAX);
        if (_Upper_idx == 0) {{
            return _No_value_constant;
        }}
        --_Upper_idx;
        const uint32_t _Lower_bound = _Lower_bounds[_Upper_idx];
        const uint16_t _Data        = _Props_and_size[_Upper_idx];
        _STL_INTERNAL_CHECK(_Code_point >= _Lower_bound);
        if constexpr (_Is_binary_property) {{
            if (_Code_point < _Lower_bound + _Data) {{
                return static_cast<_ValueEnum>(0);
            }}
        }} else {{
            const uint16_t _Size   = static_cast<uint16_t>(_Data & 0x0FFF);
            const _ValueEnum _Prop = static_cast<_ValueEnum>((_Data & 0xF000) >> 12);
            if (_Code_point < _Lower_bound + _Size) {{
                return _Prop;
            }}
        }}
        return _No_value_constant;
    }}
}};

// The following static data tables are generated from the Unicode character database.
// _Grapheme_Break_property_data comes from ucd/auxiliary/GraphemeBreakProperty.txt.
//
// _Extended_Pictographic_property_data comes from ucd/emoji/emoji-data.txt.
//
// __printable_property_data comes from ucd/extracted/DerivedGeneralCategory.txt.
//
// _Grapheme_Extend_property_data comes from ucd/DerivedCoreProperties.txt.
//
// The enums containing the values for the properties are also generated, in order to ensure they match
// up correctly with how we're parsing them.
//
// All sets of data tables are generated by tools/unicode_properties_parse/unicode_properties_data_gen.py in the
// https://github.com/microsoft/stl repository.
//
// The data format is a set of arrays for each character property. The first is an array of uint32_t encoding
// the lower bound of each range of codepoints that has the given property.
// The second is an array of uint16_t.
// - For enumerated properties, this array encodes both the range size and property value as follows:
//   16               12                                   0
//   +-----------------------------------------------------+
//   | property_value  |              range_size           |
//   +-----------------------------------------------------+
//   that is: the size is stored in the least significant 12 bits
//   (leading to a max size of 4095), and the property value is stored in the most significant 4 bits,
//   leading to a maximum of 16 property values.
// - For binary properties, this array simply stores the range size.
//
// Codepoint ranges may not overlap, and, within one property, a codepoint may only appear once. Furthermore the
// codepoint lower bounds appear in sorted (ascending) order.

{content}
_STD_END

#pragma pop_macro("new")
_STL_RESTORE_CLANG_WARNINGS
#pragma warning(pop)
#pragma pack(pop)

#endif // _STL_COMPILER_PREPROCESSOR
#endif // __MSVC_FORMAT_UCD_TABLES_HPP
"""

def property_ranges_to_table(ranges: list[PropertyRange], props: list[str], is_binary_property: bool) -> PropertyTable:
    result = PropertyTable()
    for range in sorted(ranges, key=lambda x: x.lower):
        result.lower_bounds.append(range.lower)
        size = (range.upper - range.lower) + 1
        if is_binary_property:
            assert size <= 0xFFFF
            result.props_and_range.append(size)
        else:
            assert size <= 0x0FFF
            prop_idx = props.index(range.prop)
            result.props_and_range.append(size | (prop_idx << 12))
    return result


def generate_cpp_data(filename: str, timestamp: str, prop_name: str, ranges: list[PropertyRange]) -> str:
    result = StringIO()
    prop_values = sorted(set(x.prop for x in ranges))
    is_binary_property = len(prop_values) == 1
    table = property_ranges_to_table(ranges, prop_values, is_binary_property)
    enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(
        x) for x in prop_values]
    result.write(PROP_VALUE_ENUM_TEMPLATE.lstrip().format(
        filename=filename, timestamp=timestamp, prop_name=prop_name, enumerators=",".join(enumerator_values)))
    result.write("\n")
    result.write(DATA_ARRAY_TEMPLATE.lstrip().format(filename=filename, timestamp=timestamp, prop_name=prop_name,
                 size=len(table.lower_bounds),
                 is_binary_property='true' if is_binary_property else 'false',
                 lower_bounds=",".join(["0x" + format(x, 'x') for x in table.lower_bounds]),
                 props_and_size=",".join(["0x" + format(x, 'x') for x in table.props_and_range])))
    return result.getvalue()


def read_file(filename: str) -> list[PropertyRange]:
    data_path = Path(__file__).absolute().with_name(filename)
    with data_path.open(encoding='utf-8') as f:
        filename = f.readline().replace("#", "//").rstrip()
        timestamp = f.readline().replace("#", "//").rstrip()
        ranges = compact_property_ranges([x for line in f if (x := parse_property_line(line))])
        return filename, timestamp, ranges


def generate_width_estimate_intervals(filename: str, timestamp: str, width_2_ranges: list[PropertyRange]):
    values = []

    for width_2_range in width_2_ranges:
        values.append(width_2_range.lower)
        values.append(width_2_range.upper + 1)

    return WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format(
            filename=filename, timestamp=timestamp, data=",".join(['0x' + format(x, 'x') for x in values]))


def generate_data_tables() -> str:
    """
    Generate Unicode data for inclusion into <format> from
    GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, DerivedCoreProperties.txt,
    and EastAsianWidth.txt.

    GraphemeBreakProperty.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

    emoji-data.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt

    DerivedGeneralCategory.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt

    DerivedCoreProperties.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt

    EastAsianWidth.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt

    All files are expected to be in the same directory as this script.
    """
    gbp_filename, gbp_timestamp, gbp_ranges = read_file("GraphemeBreakProperty.txt")
    emoji_filename, emoji_timestamp, emoji_ranges = read_file("emoji-data.txt")
    cat_filename, cat_timestamp, cat_ranges = read_file("DerivedGeneralCategory.txt")
    derived_filename, derived_timestamp, derived_ranges = read_file("DerivedCoreProperties.txt")
    eaw_filename, eaw_timestamp, eaw_ranges = read_file("EastAsianWidth.txt")

    printable_ranges = compact_property_ranges(sorted([
        PropertyRange(x.lower, x.upper, "Yes")
        for x in cat_ranges
        if x.prop not in ('Cc', 'Cf', 'Cs', 'Co', 'Cn', 'Zl', 'Zp', 'Zs') or chr(x.lower) == ' '
    ], key=lambda x: x.lower))

    # N4971 [format.string.std]/13
    std_wide_ranges = [
        range(0x4DC0, 0x4DFF),
        range(0x1F300, 0x1F5FF),
        range(0x1F900, 0x1F9FF),
    ]

    def has_width_2(prop_range):
        if prop_range.prop in ("F", "W"):
            return True

        for std_wide_range in std_wide_ranges:
            if prop_range.lower in std_wide_range:
                assert prop_range.upper <= std_wide_range.stop

                return True
            else:
                assert prop_range.upper not in std_wide_range

        return False

    width_2_ranges = compact_property_ranges(sorted([
        PropertyRange(x.lower, x.upper, "Yes") for x in eaw_ranges if has_width_2(x)
    ], key=lambda x: x.lower))

    gpb_cpp_data = generate_cpp_data(gbp_filename, gbp_timestamp, "Grapheme_Break", gbp_ranges)
    emoji_cpp_data = generate_cpp_data(emoji_filename, emoji_timestamp, "Extended_Pictographic", [
        x for x in emoji_ranges if x.prop == "Extended_Pictographic"])
    # _printable follows a different naming scheme, to indicate that it is a fake Unicode property.
    printable_cpp_data = generate_cpp_data(cat_filename, cat_timestamp, "_printable", printable_ranges)
    grapheme_extend_cpp_data = generate_cpp_data(derived_filename, derived_timestamp, "Grapheme_Extend", [
        x for x in derived_ranges if x.prop == "Grapheme_Extend"])
    width_estimate_intervals = generate_width_estimate_intervals(eaw_filename, eaw_timestamp, width_2_ranges)

    return "\n".join(
        [gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data, width_estimate_intervals])


if __name__ == "__main__":
    print(MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format(content=generate_data_tables()))
