#include <optional>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/FunctionTokens.h>
#include <Common/UTF8Helpers.h>
#include <Common/Exception.h>
#include <base/types.h>
#include <Common/HashTable/Hash.h>

namespace DB
{

/// Error codes thrown by the functions in this file.
/// They are only declared here (`extern`); the definitions live elsewhere.
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}

/** Functions that find all substrings with a minimal length of n
  * such that the hashes of their border (n-1)-grams are greater than the hashes of every (n-1)-gram inside the substring.
  * As a hash function use zlib crc32, which is crc32-ieee with 0xffffffff as initial value
  *
  * sparseGrams(s[, min_ngram_length, max_ngram_length])
  */
namespace
{

struct CRC32CHasher
{
    size_t operator()(const char* data, size_t length) const
    {
        return updateWeakHash32(reinterpret_cast<const UInt8*>(data), length, 0);
    }
};

/// Read-only pointer into the bytes of a string; used for the string bounds passed to `set`
/// and for the begin/end of every token returned by `get`.
using Pos = const char *;

/** Token generator behind the sparseGrams family of functions.
  *
  * Let n = `min_ngram_length`. The generator slides over all (n-1)-grams of the string
  * (symbols are bytes, or UTF-8 code points when `is_utf8` is set), hashes each of them and
  * keeps a stack (`convex_hull`) of earlier (n-1)-grams that can still act as a left border.
  * Every time a new right border (n-1)-gram is read, the stack entries whose hash is smaller
  * than the new hash, plus the first entry that is not smaller, are paired with it; each pair
  * yields the substring from the first byte of the left (n-1)-gram to the last byte of the right one.
  * Substrings longer than `max_ngram_length` symbols are not produced.
  *
  * Usage protocol: `init` once, then for every string `set` followed by `get` until it returns false.
  */
template <bool is_utf8>
class SparseGramsImpl
{
private:
    CRC32CHasher hasher;

    /// Bounds of the string that is currently processed, see `set`.
    Pos pos = nullptr;
    Pos end = nullptr;
    UInt64 min_ngram_length = 3;
    UInt64 max_ngram_length = 100;

    /// Current batch of answers as (begin, end) byte offsets relative to `pos`.
    /// One call of `consume` adds at most one answer per element of `convex_hull`,
    /// and `convex_hull` is expected to stay small, see the comment to `convex_hull` for more details.
    std::vector<std::pair<size_t, size_t>> result;
    /// Index of the next element of `result` that has not been returned yet.
    size_t iter_result = 0;

    /// Description of one already seen (n-1)-gram that may become the left border of an answer.
    struct PositionAndHash
    {
        /// Byte offset of the last symbol of the (n-1)-gram.
        size_t position;
        /// Byte offset of the first symbol of the (n-1)-gram.
        size_t left_ngram_position;
        /// Index (in symbols, not bytes) of the last symbol of the (n-1)-gram.
        size_t symbol_index;
        /// Hash of the bytes of the (n-1)-gram.
        size_t hash;
    };

    /// Sliding window of `n` symbols over [data, end).
    /// `right_iterator` is the byte offset of the last symbol of the window, `left_iterator` of the first one.
    class NGramSymbolIterator
    {
    public:
        /// Creates an exhausted iterator over an empty range.
        NGramSymbolIterator() = default;

        NGramSymbolIterator(Pos data_, Pos end_, size_t n_)
            : data(data_), end(end_), n(n_)
        {
        }

        /// Moves the right edge one symbol forward. The left edge starts to follow
        /// as soon as the window has grown to `n` symbols. Returns false at the end of the string.
        bool increment()
        {
            if (isEnd())
                return false;

            right_iterator = getNextPosition(right_iterator);

            if (++num_increments >= n)
                left_iterator = getNextPosition(left_iterator);

            return true;
        }

        bool isEnd() const
        {
            return data + right_iterator >= end;
        }

        /// Byte offsets of the first and of the last symbol of the window.
        std::pair<size_t, size_t> getNGramPositions() const
        {
            return {left_iterator, right_iterator};
        }

        /// Index (in symbols) of the last symbol of the window.
        size_t getRightSymbol() const
        {
            return num_increments;
        }

        /// Byte offset of the symbol that follows the symbol starting at `iterator`.
        /// `iterator` must point inside the string.
        /// In UTF-8 mode throws BAD_ARGUMENTS if the symbol does not fit into the string.
        size_t getNextPosition(size_t iterator) const
        {
            if constexpr (is_utf8)
            {
                const size_t next = iterator + UTF8::seqLength(data[iterator]);

                /// The leading byte may announce more bytes than the string still has (truncated sequence).
                /// Without this check the hash and the returned token would cover bytes after `end`.
                if (next > static_cast<size_t>(end - data))
                    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect utf8 symbol");

                return next;
            }
            else
                return iterator + 1;
        }

    private:

        /// The members are initialized so that a default constructed iterator is a valid, exhausted one.
        Pos data = nullptr;
        Pos end = nullptr;
        size_t n = 0;
        size_t right_iterator = 0;
        size_t left_iterator = 0;
        size_t num_increments = 0;
    };

    /// The convex hull contains the maximum values of the suffixes that start from the current right iterator.
    /// For example, if we have n-gram hashes like [1,5,2,4,1,3] and current right position is 4 (the last one)
    /// then our convex hull will consist of elements:
    /// [{position:1, hash:5}, {position:3, hash:4}, {position:4,hash:1}]
    /// Assuming that hashes are uniformly distributed, the expected size of convex_hull is N^{1/3},
    /// where N is the length of the string.
    /// Proof: https://math.stackexchange.com/questions/3469295/expected-number-of-vertices-in-a-convex-hull
    std::vector<PositionAndHash> convex_hull;
    NGramSymbolIterator symbol_iterator;

    /// Length in symbols of the substring that starts with the (n-1)-gram ending at symbol `left_symbol_index`
    /// and ends with the (n-1)-gram ending at symbol `right_symbol_index` (left_symbol_index <= right_symbol_index).
    /// The left (n-1)-gram begins `min_ngram_length - 2` symbols before its last symbol,
    /// so two neighbouring (n-1)-grams give exactly `min_ngram_length`.
    size_t getSubstringLength(size_t left_symbol_index, size_t right_symbol_index) const
    {
        return right_symbol_index - left_symbol_index + min_ngram_length - 1;
    }

    /// Get the next batch of answers. Returns false if there can be no more answers.
    /// A returned batch may be empty.
    bool consume()
    {
        if (symbol_iterator.isEnd())
            return false;

        auto [ngram_left_position, right_position] = symbol_iterator.getNGramPositions();
        size_t right_symbol_index = symbol_iterator.getRightSymbol();
        size_t next_right_position = symbol_iterator.getNextPosition(right_position);
        size_t right_border_ngram_hash = hasher(pos + ngram_left_position, next_right_position - ngram_left_position);

        /// Every left border with a smaller hash forms an answer with the current right border
        /// and can not be a left border for any later right border.
        while (!convex_hull.empty() && convex_hull.back().hash < right_border_ngram_hash)
        {
            const PositionAndHash & candidate = convex_hull.back();
            if (getSubstringLength(candidate.symbol_index, right_symbol_index) > max_ngram_length)
            {
                /// The top of the hull is the closest candidate. If even this one is too far away,
                /// the deeper ones are farther still, now and for every later right border, so drop them all.
                convex_hull.clear();
                break;
            }
            result.emplace_back(candidate.left_ngram_position, next_right_position);
            convex_hull.pop_back();
        }

        /// The first left border whose hash is not smaller also forms an answer, but it stays in the hull.
        if (!convex_hull.empty())
        {
            const PositionAndHash & candidate = convex_hull.back();
            if (getSubstringLength(candidate.symbol_index, right_symbol_index) <= max_ngram_length)
                result.emplace_back(candidate.left_ngram_position, next_right_position);
        }

        /// There should not be identical hashes in the convex hull. If there are, then we leave only the last one.
        while (!convex_hull.empty() && convex_hull.back().hash == right_border_ngram_hash)
            convex_hull.pop_back();

        convex_hull.push_back(PositionAndHash{
            .position = right_position,
            .left_ngram_position = ngram_left_position,
            .symbol_index = right_symbol_index,
            .hash = right_border_ngram_hash
        });
        symbol_iterator.increment();
        return true;
    }

    /// Returns the next answer as byte offsets relative to `pos`, or nullopt when the string is exhausted.
    std::optional<std::pair<size_t, size_t>> getNextIndices()
    {
        /// A batch may be empty, so keep consuming until there is something to return.
        while (iter_result >= result.size())
        {
            result.clear();
            iter_result = 0;

            if (!consume())
                return std::nullopt;
        }

        return result[iter_result++];
    }

public:
    static constexpr auto name = is_utf8 ? "sparseGramsUTF8" : "sparseGrams";
    static constexpr auto strings_argument_position = 0uz;
    static bool isVariadic() { return true; }
    static size_t getNumberOfArguments() { return 0; }
    static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {1}; }

    /// Validates the argument types: a mandatory String and up to two optional constant native integers.
    static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
    {
        FunctionArgumentDescriptors mandatory_args{
            {"s", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isString), nullptr, "String"},
        };

        FunctionArgumentDescriptors optional_args{
            {"min_ngram_length", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isNativeInteger), isColumnConst, "const Number"},
            {"max_ngram_length", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isNativeInteger), isColumnConst, "const Number"},
        };

        validateFunctionArguments(func, arguments, mandatory_args, optional_args);
    }

    /// Reads `min_ngram_length` (2nd argument) and `max_ngram_length` (3rd argument) if they are present.
    /// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH for more than 3 arguments and BAD_ARGUMENTS
    /// if `min_ngram_length` < 3 or `max_ngram_length` < `min_ngram_length`.
    void init(const ColumnsWithTypeAndName & arguments, bool /*max_substrings_includes_remaining_string*/)
    {
        if (arguments.size() > 3)
            throw Exception(
                ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                "Number of arguments for function {} doesn't match: passed {}, must be from 1 to 3",
                name,
                arguments.size());

        if (arguments.size() >= 2)
            min_ngram_length = arguments[1].column->getUInt(0);

        if (min_ngram_length < 3)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Argument 'min_ngram_length' must be greater or equal to 3");

        if (arguments.size() == 3)
            max_ngram_length = arguments[2].column->getUInt(0);

        if (max_ngram_length < min_ngram_length)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Argument 'max_ngram_length' must be greater or equal to 'min_ngram_length'");
    }

    /// Called for each next string.
    void set(Pos pos_, Pos end_)
    {
        pos = pos_;
        end = end_;

        /// Forget everything about the previous string. The hull is never empty after a string that had
        /// at least one (n-1)-gram, and its entries hold offsets and symbol indices of that old string.
        convex_hull.clear();
        result.clear();
        iter_result = 0;

        /// Grow the window to the first complete (n-1)-gram; it is hashed by the first call of `consume`.
        symbol_iterator = NGramSymbolIterator(pos, end, min_ngram_length - 1);
        for (size_t i = 0; i < min_ngram_length - 2; ++i)
            if (!symbol_iterator.increment())
                return;
    }

    /// Get the next token, if any, or return false.
    bool get(Pos & token_begin, Pos & token_end)
    {
        auto cur_result = getNextIndices();
        if (!cur_result)
            return false;

        auto [iter_left, iter_right] = *cur_result;

        token_begin = pos + iter_left;
        token_end = pos + iter_right;
        return true;
    }
};

/** sparseGramsHashes / sparseGramsHashesUTF8.
  *
  * For every row of the String argument runs SparseGramsImpl over the string and, instead of the
  * selected substrings themselves, stores the CRC32CHasher hash of each of them.
  * The result type is Array(UInt32).
  */
template <bool is_utf8>
class SparseGramsHashes : public IFunction
{
public:
    static constexpr auto name = is_utf8 ? "sparseGramsHashesUTF8" : "sparseGramsHashes";

    static FunctionPtr create(ContextPtr) { return std::make_shared<SparseGramsHashes>(); }

    String getName() const override { return name; }

    /// The number of arguments is checked in `getReturnTypeImpl`.
    bool isVariadic() const override { return true; }
    size_t getNumberOfArguments() const override { return 0; }

    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
    bool useDefaultImplementationForConstants() const override { return true; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    /// Validates the arguments with the checker of the token generator and returns Array(UInt32).
    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & args) const override
    {
        SparseGramsImpl<is_utf8>::checkArguments(*this, args);

        auto nested_type = std::make_shared<DataTypeUInt32>();
        return std::make_shared<DataTypeArray>(nested_type);
    }

    /// Throws ILLEGAL_TYPE_OF_ARGUMENT if the first argument is not a ColumnString.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        SparseGramsImpl<is_utf8> extractor;
        extractor.init(arguments, false);

        const auto * strings = checkAndGetColumn<ColumnString>(
            arguments[SparseGramsImpl<is_utf8>::strings_argument_position].column.get());
        if (!strings)
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal argument for function {}", name);

        const auto & chars = strings->getChars();
        const auto & offsets = strings->getOffsets();

        auto hashes_column = ColumnUInt32::create();
        auto & hashes = hashes_column->getData();
        hashes.reserve(chars.size());

        auto offsets_column = ColumnArray::ColumnOffsets::create();
        auto & result_offsets = offsets_column->getData();
        result_offsets.reserve(input_rows_count);

        CRC32CHasher hasher;

        ColumnString::Offset row_begin = 0;
        for (size_t row = 0; row < input_rows_count; ++row)
        {
            const ColumnString::Offset row_end = offsets[row];

            /// The same two variables first carry the bounds of the row and then of every extracted token.
            Pos token_begin = reinterpret_cast<Pos>(&chars[row_begin]);
            Pos token_end = reinterpret_cast<Pos>(&chars[row_end]);

            extractor.set(token_begin, token_end);
            while (extractor.get(token_begin, token_end))
                hashes.push_back(hasher(token_begin, token_end - token_begin));

            /// Array offsets are cumulative: the number of hashes written so far.
            result_offsets.push_back(hashes.size());
            row_begin = row_end;
        }

        return ColumnArray::create(std::move(hashes_column), std::move(offsets_column));
    }
};

/// The substring-returning functions are instantiations of the `FunctionTokens` template
/// with the byte-wise and the UTF-8 token generator respectively.
/// NOTE(review): `FunctionTokens` is declared outside this file (Functions/FunctionTokens.h);
/// presumably it calls `init` once and then `set`/`get` for every row - confirm there.
using FunctionSparseGrams = FunctionTokens<SparseGramsImpl<false>>;
using FunctionSparseGramsUTF8 = FunctionTokens<SparseGramsImpl<true>>;

}

/// Registers sparseGrams, sparseGramsUTF8, sparseGramsHashes and sparseGramsHashesUTF8 in the function
/// factory, each with its own description, syntax, returned value and example.
/// The argument list, the version and the category are the same for all four functions.
REGISTER_FUNCTION(SparseGrams)
{
    /// Parts of the documentation that are shared by all four functions.
    FunctionDocumentation::Arguments common_arguments = {
        {"s", "An input string.", {"String"}},
        {"min_ngram_length", "Optional. The minimum length of extracted ngram. The default and minimal value is 3.", {"UInt*"}},
        {"max_ngram_length", "Optional. The maximum length of extracted ngram. The default value is 100. Should be not less than `min_ngram_length`.", {"UInt*"}}
    };
    FunctionDocumentation::IntroducedIn common_introduced_in = {25, 5};
    FunctionDocumentation::Category common_category = FunctionDocumentation::Category::String;

    /// sparseGrams
    {
        FunctionDocumentation::Description description = R"(
Finds all substrings of a given string that have a length of at least `n`,
where the hashes of the (n-1)-grams at the borders of the substring
are strictly greater than those of any (n-1)-gram inside the substring.
Uses `CRC32` as a hash function.
)";
        FunctionDocumentation::Syntax syntax = "sparseGrams(s[, min_ngram_length, max_ngram_length])";
        FunctionDocumentation::ReturnedValue returned_value = {"Returns an array of selected substrings.", {"Array(String)"}};
        FunctionDocumentation::Examples examples = {
        {
            "Usage example",
            "SELECT sparseGrams('alice', 3)",
            R"(
┌─sparseGrams('alice', 3)────────────┐
│ ['ali','lic','lice','ice']         │
└────────────────────────────────────┘
        )"
        }
        };
        FunctionDocumentation documentation = {description, syntax, common_arguments, returned_value, examples, common_introduced_in, common_category};

        factory.registerFunction<FunctionSparseGrams>(documentation);
    }

    /// sparseGramsUTF8
    {
        FunctionDocumentation::Description description = R"(
Finds all substrings of a given UTF-8 string that have a length of at least `n`, where the hashes of the (n-1)-grams at the borders of the substring are strictly greater than those of any (n-1)-gram inside the substring.
Expects a UTF-8 string, throws an exception in case of an invalid UTF-8 sequence.
Uses `CRC32` as a hash function.
)";
        FunctionDocumentation::Syntax syntax = "sparseGramsUTF8(s[, min_ngram_length, max_ngram_length])";
        FunctionDocumentation::ReturnedValue returned_value = {"Returns an array of selected UTF-8 substrings.", {"Array(String)"}};
        FunctionDocumentation::Examples examples = {
        {
            "Usage example",
            "SELECT sparseGramsUTF8('алиса', 3)",
            R"(
┌─sparseGramsUTF8('алиса', 3)─┐
│ ['али','лис','иса']         │
└─────────────────────────────┘
        )"
        }
        };
        FunctionDocumentation documentation = {description, syntax, common_arguments, returned_value, examples, common_introduced_in, common_category};

        factory.registerFunction<FunctionSparseGramsUTF8>(documentation);
    }

    /// sparseGramsHashes
    {
        FunctionDocumentation::Description description = R"(
Finds hashes of all substrings of a given string that have a length of at least `n`,
where the hashes of the (n-1)-grams at the borders of the substring
are strictly greater than those of any (n-1)-gram inside the substring.
Uses `CRC32` as a hash function.
)";
        FunctionDocumentation::Syntax syntax = "sparseGramsHashes(s[, min_ngram_length, max_ngram_length])";
        FunctionDocumentation::ReturnedValue returned_value = {"Returns an array of selected substrings CRC32 hashes.", {"Array(UInt32)"}};
        FunctionDocumentation::Examples examples = {
        {
            "Usage example",
            "SELECT sparseGramsHashes('alice', 3)",
            R"(
┌─sparseGramsHashes('alice', 3)──────────────────────┐
│ [1481062250,2450405249,4012725991,1918774096]      │
└────────────────────────────────────────────────────┘
        )"
        }
        };
        FunctionDocumentation documentation = {description, syntax, common_arguments, returned_value, examples, common_introduced_in, common_category};

        factory.registerFunction<SparseGramsHashes<false>>(documentation);
    }

    /// sparseGramsHashesUTF8
    {
        FunctionDocumentation::Description description = R"(
Finds hashes of all substrings of a given UTF-8 string that have a length of at least `n`, where the hashes of the (n-1)-grams at the borders of the substring are strictly greater than those of any (n-1)-gram inside the substring.
Expects UTF-8 string, throws an exception in case of invalid UTF-8 sequence.
Uses `CRC32` as a hash function.
)";
        FunctionDocumentation::Syntax syntax = "sparseGramsHashesUTF8(s[, min_ngram_length, max_ngram_length])";
        FunctionDocumentation::ReturnedValue returned_value = {"Returns an array of selected UTF-8 substrings CRC32 hashes.", {"Array(UInt32)"}};
        FunctionDocumentation::Examples examples = {
        {
            "Usage example",
            "SELECT sparseGramsHashesUTF8('алиса', 3)",
            R"(
┌─sparseGramsHashesUTF8('алиса', 3)─┐
│ [4178533925,3855635300,561830861] │
└───────────────────────────────────┘
        )"
        }
        };
        FunctionDocumentation documentation = {description, syntax, common_arguments, returned_value, examples, common_introduced_in, common_category};

        factory.registerFunction<SparseGramsHashes<true>>(documentation);
    }
}

}
