Submission #67791514


Source Code Expand

// #undef YOSUPO_LOCAL

#if 0 and !defined(__clang__)
#include <vector>
#pragma GCC target("avx2,bmi,bmi2,lzcnt,popcnt")
#pragma GCC optimize("Ofast")
#endif


#include <stdio.h>
#include <unistd.h>

#include <algorithm>
#include <array>
#include <bit>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>



namespace yosupo {

namespace internal {

template <class T>
using is_signed_int128 =
    typename std::conditional<std::is_same<T, __int128_t>::value ||
                                  std::is_same<T, __int128>::value,
                              std::true_type,
                              std::false_type>::type;

template <class T>
using is_unsigned_int128 =
    typename std::conditional<std::is_same<T, __uint128_t>::value ||
                                  std::is_same<T, unsigned __int128>::value,
                              std::true_type,
                              std::false_type>::type;

template <class T>
using make_unsigned_int128 =
    typename std::conditional<std::is_same<T, __int128_t>::value,
                              __uint128_t,
                              unsigned __int128>;

template <class T>
using is_integral =
    typename std::conditional<std::is_integral<T>::value ||
                                  internal::is_signed_int128<T>::value ||
                                  internal::is_unsigned_int128<T>::value,
                              std::true_type,
                              std::false_type>::type;

template <class T>
using is_signed_int = typename std::conditional<(is_integral<T>::value &&
                                                 std::is_signed<T>::value) ||
                                                    is_signed_int128<T>::value,
                                                std::true_type,
                                                std::false_type>::type;

template <class T>
using is_unsigned_int =
    typename std::conditional<(is_integral<T>::value &&
                               std::is_unsigned<T>::value) ||
                                  is_unsigned_int128<T>::value,
                              std::true_type,
                              std::false_type>::type;

template <class T>
using to_unsigned = typename std::conditional<
    is_signed_int128<T>::value,
    make_unsigned_int128<T>,
    typename std::conditional<std::is_signed<T>::value,
                              std::make_unsigned<T>,
                              std::common_type<T>>::type>::type;

template <class T>
using is_integral_t = std::enable_if_t<is_integral<T>::value>;

template <class T>
using is_signed_int_t = std::enable_if_t<is_signed_int<T>::value>;

template <class T>
using is_unsigned_int_t = std::enable_if_t<is_unsigned_int<T>::value>;

template <class T> using to_unsigned_t = typename to_unsigned<T>::type;

}  // namespace internal

}  // namespace yosupo

namespace yosupo {

struct Scanner {
  public:
    Scanner(const Scanner&) = delete;
    Scanner& operator=(const Scanner&) = delete;

    Scanner(FILE* fp) : fd(fileno(fp)) { line[0] = 127; }

    void read() {}
    template <class H, class... T> void read(H& h, T&... t) {
        bool f = read_single(h);
        assert(f);
        read(t...);
    }

    int read_unsafe() { return 0; }
    template <class H, class... T> int read_unsafe(H& h, T&... t) {
        bool f = read_single(h);
        if (!f) return 0;
        return 1 + read_unsafe(t...);
    }

    int close() { return ::close(fd); }

  private:
    static constexpr int SIZE = 1 << 15;

    int fd = -1;
    std::array<char, SIZE + 1> line;
    int st = 0, ed = 0;
    bool eof = false;

    bool read_single(std::string& ref) {
        if (!skip_space()) return false;
        ref = "";
        while (true) {
            char c = top();
            if (c <= ' ') break;
            ref += c;
            st++;
        }
        return true;
    }
    bool read_single(double& ref) {
        std::string s;
        if (!read_single(s)) return false;
        ref = std::stod(s);
        return true;
    }

    template <class T,
              std::enable_if_t<std::is_same<T, char>::value>* = nullptr>
    bool read_single(T& ref) {
        if (!skip_space<50>()) return false;
        ref = top();
        st++;
        return true;
    }

    template <class T,
              internal::is_signed_int_t<T>* = nullptr,
              std::enable_if_t<!std::is_same<T, char>::value>* = nullptr>
    bool read_single(T& sref) {
        using U = internal::to_unsigned_t<T>;
        if (!skip_space<50>()) return false;
        bool neg = false;
        if (line[st] == '-') {
            neg = true;
            st++;
        }
        U ref = 0;
        do {
            ref = 10 * ref + (line[st++] & 0x0f);
        } while (line[st] >= '0');
        sref = neg ? -ref : ref;
        return true;
    }
    template <class U,
              internal::is_unsigned_int_t<U>* = nullptr,
              std::enable_if_t<!std::is_same<U, char>::value>* = nullptr>
    bool read_single(U& ref) {
        if (!skip_space<50>()) return false;
        ref = 0;
        do {
            ref = 10 * ref + (line[st++] & 0x0f);
        } while (line[st] >= '0');
        return true;
    }

    bool reread() {
        if (ed - st >= 50) return true;
        if (st > SIZE / 2) {
            std::memmove(line.data(), line.data() + st, ed - st);
            ed -= st;
            st = 0;
        }
        if (eof) return false;
        auto u = ::read(fd, line.data() + ed, SIZE - ed);
        if (u == 0) {
            eof = true;
            line[ed] = '\0';
            u = 1;
        }
        ed += int(u);
        line[ed] = char(127);
        return true;
    }

    char top() {
        if (st == ed) {
            bool f = reread();
            assert(f);
        }
        return line[st];
    }

    template <int TOKEN_LEN = 0> bool skip_space() {
        while (true) {
            while (line[st] <= ' ') st++;
            if (ed - st > TOKEN_LEN) return true;
            if (st > ed) st = ed;
            for (auto i = st; i < ed; i++) {
                if (line[i] <= ' ') return true;
            }
            if (!reread()) return false;
        }
    }
};

struct Printer {
  public:
    template <char sep = ' ', bool F = false> void write() {}
    template <char sep = ' ', bool F = false, class H, class... T>
    void write(const H& h, const T&... t) {
        if (F) write_single(sep);
        write_single(h);
        write<true>(t...);
    }
    template <char sep = ' ', class... T> void writeln(const T&... t) {
        write<sep>(t...);
        write_single('\n');
    }

    Printer(FILE* _fp) : fd(fileno(_fp)) {}
    ~Printer() { flush(); }

    int close() {
        flush();
        return ::close(fd);
    }

    void flush() {
        if (pos) {
            auto res = ::write(fd, line.data(), pos);
            assert(res != -1);
            pos = 0;
        }
    }

  private:
    static std::array<std::array<char, 2>, 100> small;
    static std::array<unsigned long long, 20> tens;

    static constexpr size_t SIZE = 1 << 15;
    int fd;
    std::array<char, SIZE> line;
    size_t pos = 0;
    std::stringstream ss;

    template <class T,
              std::enable_if_t<std::is_same<char, T>::value>* = nullptr>
    void write_single(const T& val) {
        if (pos == SIZE) flush();
        line[pos++] = val;
    }

    template <class T,
              internal::is_signed_int_t<T>* = nullptr,
              std::enable_if_t<!std::is_same<char, T>::value>* = nullptr>
    void write_single(const T& val) {
        using U = internal::to_unsigned_t<T>;
        if (val == 0) {
            write_single('0');
            return;
        }
        if (pos > SIZE - 50) flush();
        U uval = val;
        if (val < 0) {
            write_single('-');
            uval = -uval;
        }
        write_unsigned(uval);
    }

    template <class U,
              internal::is_unsigned_int_t<U>* = nullptr,
              std::enable_if_t<!std::is_same<char, U>::value>* = nullptr>
    void write_single(U uval) {
        if (uval == 0) {
            write_single('0');
            return;
        }
        if (pos > SIZE - 50) flush();

        write_unsigned(uval);
    }

    static int calc_len(uint64_t x) {
        int i = ((63 - std::countl_zero(x)) * 3 + 3) / 10;
        if (x < tens[i])
            return i;
        else
            return i + 1;
    }

    template <class U,
              internal::is_unsigned_int_t<U>* = nullptr,
              std::enable_if_t<2 >= sizeof(U)>* = nullptr>
    void write_unsigned(U uval) {
        size_t len = calc_len(uval);
        pos += len;

        char* ptr = line.data() + pos;
        while (uval >= 100) {
            ptr -= 2;
            memcpy(ptr, small[uval % 100].data(), 2);
            uval /= 100;
        }
        if (uval >= 10) {
            memcpy(ptr - 2, small[uval].data(), 2);
        } else {
            *(ptr - 1) = char('0' + uval);
        }
    }

    template <class U,
              internal::is_unsigned_int_t<U>* = nullptr,
              std::enable_if_t<4 == sizeof(U)>* = nullptr>
    void write_unsigned(U uval) {
        std::array<char, 8> buf;
        memcpy(buf.data() + 6, small[uval % 100].data(), 2);
        memcpy(buf.data() + 4, small[uval / 100 % 100].data(), 2);
        memcpy(buf.data() + 2, small[uval / 10000 % 100].data(), 2);
        memcpy(buf.data() + 0, small[uval / 1000000 % 100].data(), 2);

        if (uval >= 100000000) {
            if (uval >= 1000000000) {
                memcpy(line.data() + pos, small[uval / 100000000 % 100].data(),
                       2);
                pos += 2;
            } else {
                line[pos] = char('0' + uval / 100000000);
                pos++;
            }
            memcpy(line.data() + pos, buf.data(), 8);
            pos += 8;
        } else {
            size_t len = calc_len(uval);
            memcpy(line.data() + pos, buf.data() + (8 - len), len);
            pos += len;
        }
    }

    template <class U,
              internal::is_unsigned_int_t<U>* = nullptr,
              std::enable_if_t<8 == sizeof(U)>* = nullptr>
    void write_unsigned(U uval) {
        size_t len = calc_len(uval);
        pos += len;

        char* ptr = line.data() + pos;
        while (uval >= 100) {
            ptr -= 2;
            memcpy(ptr, small[uval % 100].data(), 2);
            uval /= 100;
        }
        if (uval >= 10) {
            memcpy(ptr - 2, small[uval].data(), 2);
        } else {
            *(ptr - 1) = char('0' + uval);
        }
    }

    template <
        class U,
        std::enable_if_t<internal::is_unsigned_int128<U>::value>* = nullptr>
    void write_unsigned(U uval) {
        static std::array<char, 50> buf;
        size_t len = 0;
        while (uval > 0) {
            buf[len++] = char((uval % 10) + '0');
            uval /= 10;
        }
        std::reverse(buf.begin(), buf.begin() + len);
        memcpy(line.data() + pos, buf.data(), len);
        pos += len;
    }

    void write_single(const std::string& s) {
        for (char c : s) write_single(c);
    }
    void write_single(const char* s) {
        size_t len = strlen(s);
        for (size_t i = 0; i < len; i++) write_single(s[i]);
    }
    template <class T> void write_single(const std::vector<T>& val) {
        auto n = val.size();
        for (size_t i = 0; i < n; i++) {
            if (i) write_single(' ');
            write_single(val[i]);
        }
    }
};

inline std::array<std::array<char, 2>, 100> Printer::small = [] {
    std::array<std::array<char, 2>, 100> table;
    for (int i = 0; i <= 99; i++) {
        table[i][1] = char('0' + (i % 10));
        table[i][0] = char('0' + (i / 10 % 10));
    }
    return table;
}();
inline std::array<unsigned long long, 20> Printer::tens = [] {
    std::array<unsigned long long, 20> table;
    for (int i = 0; i < 20; i++) {
        table[i] = 1;
        for (int j = 0; j < i; j++) {
            table[i] *= 10;
        }
    }
    return table;
}();

}  // namespace yosupo

#include <concepts>
#include <cstdlib>


#include <cmath>



namespace yosupo {

using i8 = int8_t;
using u8 = uint8_t;
using i16 = int16_t;
using u16 = uint16_t;
using i32 = int32_t;
using u32 = uint32_t;
using i64 = int64_t;
using u64 = uint64_t;
using i128 = __int128;
using u128 = unsigned __int128;

using f32 = float;
using f64 = double;

}  // namespace yosupo

namespace yosupo {
// x * inv_u32(x) = 1 (mod 2^32)
inline constexpr u32 inv_u32(const u32 x) {
    u32 inv = 1;
    for (int i = 0; i < 5; i++) {
        inv *= 2u - inv * x;
    }
    return inv;
}

inline constexpr u64 pow_mod_u64(u64 x, u64 n, u64 m) {
    if (m == 1) return 0;
    u64 r = 1;
    u64 y = x % m;
    while (n) {
        if (n & 1) r = (u64)((u128)(1) * r * y % m);
        y = (u64)((u128)(1) * y * y % m);
        n >>= 1;
    }
    return r;
}

inline u64 isqrt(u64 a) {
    if (a == u64(-1)) return (u64(1) << 32) - 1;
    u64 x = (u64)std::sqrt((double)a);
    return x * x > a ? x - 1 : x;
}

// integer k-th root: floor(a^(1/k))
inline u64 iroot(u64 a, int k) {
    if (k == 2) return isqrt(a);
    if (k == 1 || a == 0) return a;
    if (k >= 64 || a < (u64(1) << k)) return 1;

    // is (b^k <= a)?
    auto check = [&](u64 b) {
        u128 result = 1;
        for (int i = 0; i < k; i++) {
            result *= b;
            if (result > a) return false;
        }
        return true;
    };

    // lw^k <= a < up^k
    u64 lw = 2;
    u64 up = 1LL << ((64 + k - 1) / k);
    while (up - lw > 1) {
        u64 mid = (lw + up) / 2;
        if (check(mid)) {
            lw = mid;
        } else {
            up = mid;
        }
    }

    return lw;
}

}  // namespace yosupo

#include <initializer_list>
#include <limits>


namespace yosupo {

inline constexpr bool is_prime(u32 n) {
    if (n == 2) return true;
    if (n % 2 == 0) return false;
    u64 d = n - 1;
    while (d % 2 == 0) d /= 2;
    for (u64 a : {2, 7, 61}) {
        if (a % n == 0) return true;
        u64 t = d;
        u64 y = pow_mod_u64(a, t, n);
        while (t != n - 1 && y != 1 && y != n - 1) {
            y = (u64)((u128)(1) * y * y % n);
            t <<= 1;
        }
        if (y != n - 1 && t % 2 == 0) {
            return false;
        }
    }
    return true;
}
inline constexpr bool is_prime(u64 n) {
    if (n <= std::numeric_limits<u32>::max()) {
        return is_prime((u32)n);
    }
    if (n % 2 == 0) return false;
    u64 d = n - 1;
    while (d % 2 == 0) d /= 2;
    for (u64 a : {2, 325, 9375, 28178, 450775, 9780504, 1795265022}) {
        if (a % n == 0) return true;
        u64 t = d;
        u64 y = pow_mod_u64(a, t, n);
        while (t != n - 1 && y != 1 && y != n - 1) {
            y = (u64)((u128)(1) * y * y % n);
            t <<= 1;
        }
        if (y != n - 1 && t % 2 == 0) {
            return false;
        }
    }
    return true;
}
inline bool is_prime(i32 n) {
    if (n <= 1) return false;
    return is_prime((u32)n);
}
inline bool is_prime(i64 n) {
    if (n <= 1) return false;
    return is_prime((u64)n);
}

// lpf_table[i] = the smallest prime factor of i
inline std::vector<i32> lpf_table(i32 n) {
    std::vector<i32> table(n + 1, -1);
    std::vector<int> primes;
    for (int d = 2; d <= n; d++) {
        if (table[d] == -1) {
            primes.push_back(d);
            table[d] = d;
        }
        for (int p : primes) {
            if (p * d > n || p > table[d]) break;
            table[p * d] = p;
        }
    }
    return table;
}

// enumerate primes no more than n
inline std::vector<i32> primes(i32 n) {
    std::vector<i32> table = lpf_table(n);
    std::vector<i32> p;
    for (int i = 2; i <= n; i++) {
        if (table[i] == i) p.push_back(i);
    }
    return p;
}

}  // namespace yosupo

namespace yosupo {

template <i32 MOD>
    requires requires {
        MOD % 2 == 1 && 1 <= MOD&& MOD <= (1 << 30) - 1;
        is_prime(MOD);
    }
struct ModInt {
    static constexpr i32 mod() { return MOD; }

    constexpr ModInt() : x(0) {}

    // It's OK because _x * B2 < 2**32 * MOD
    constexpr ModInt(u32 _x) : x(reduce_mul(_x, B2)) {}

    constexpr ModInt(std::signed_integral auto _x)
        : ModInt((u32)(_x % MOD + MOD)) {}
    constexpr ModInt(std::unsigned_integral auto _x)
        : ModInt((u32)(_x % MOD)) {}

    constexpr i32 val() const {
        u32 y = reduce_mul(x, 1);
        return y < MOD ? y : y - MOD;
    }

    constexpr ModInt operator+() const { return *this; }
    constexpr ModInt operator-() const { return ModInt() -= *this; }

    constexpr ModInt& operator+=(const ModInt& rhs) {
        x += rhs.x;
        x = std::min(x, x - 2 * MOD);
        return *this;
    }
    constexpr friend ModInt operator+(const ModInt& lhs, const ModInt& rhs) {
        return ModInt(lhs) += rhs;
    }

    constexpr ModInt& operator-=(const ModInt& rhs) {
        x += 2 * MOD - rhs.x;
        x = std::min(x, x - 2 * MOD);
        return *this;
    }
    constexpr friend ModInt operator-(const ModInt& lhs, const ModInt& rhs) {
        return ModInt(lhs) -= rhs;
    }

    constexpr ModInt& operator*=(const ModInt& rhs) {
        x = reduce_mul(x, rhs.x);
        return *this;
    }
    constexpr friend ModInt operator*(const ModInt& lhs, const ModInt& rhs) {
        return ModInt(lhs) *= rhs;
    }

    constexpr ModInt& operator/=(const ModInt& rhs) {
        return *this *= rhs.inv();
    }
    constexpr friend ModInt operator/(const ModInt& lhs, const ModInt& rhs) {
        return ModInt(lhs) /= rhs;
    }

    friend bool operator==(const ModInt& lhs, const ModInt& rhs) {
        return std::min(lhs.x, lhs.x - MOD) == std::min(rhs.x, rhs.x - MOD);
    }

    constexpr ModInt pow(u64 n) const {
        ModInt v = *this, r = 1;
        while (n) {
            if (n & 1) r *= v;
            v *= v;
            n >>= 1;
        }
        return r;
    }
    constexpr ModInt inv() const { return pow(MOD - 2); }

    std::string dump() const { return std::to_string(val()); }

  private:
    u32 x;  // [0, 2 * MOD)

    static constexpr u32 B = ((u64(1) << 32)) % MOD;
    static constexpr u32 B2 = u64(1) * B * B % MOD;
    static constexpr u32 INV = -inv_u32(MOD);

    // Input: (l * r) must be no more than (2^32 * MOD)
    // Output: ((l * r) / 2^32) % MOD
    static constexpr u32 reduce_mul(u32 l, u32 r) {
        u64 x = u64(1) * l * r;
        x += u64(u32(x) * INV) * MOD;
        return u32(x >> 32);
    }
};

using ModInt998244353 = ModInt<998244353>;
using ModInt1000000007 = ModInt<1000000007>;

}  // namespace yosupo

#include <cstddef>
#include <functional>
#include <queue>
#include <utility>


#include <immintrin.h>

#include <span>




namespace yosupo {
inline constexpr u32 smallest_primitive_root(u32 m) {
    if (m == 2) return 1;

    u32 divs[20] = {};
    int cnt = 0;
    u32 x = (m - 1) / 2;
    for (int i = 2; (u64)(i)*i <= x; i += 2) {
        if (x % i == 0) {
            divs[cnt++] = i;
            while (x % i == 0) {
                x /= i;
            }
        }
    }
    if (x > 1) {
        divs[cnt++] = x;
    }
    for (u32 g = 2;; g++) {
        bool ok = true;
        for (int i = 0; i < cnt; i++) {
            if (pow_mod_u64(g, (m - 1) / divs[i], m) == 1) {
                ok = false;
                break;
            }
        }
        if (ok) return g;
    }
}
}  // namespace yosupo

#include <xmmintrin.h>



#include <map>
#include <set>
#include <tuple>


namespace yosupo {

inline std::string dump(const std::string& t) { return t; }
inline std::string dump(const char* t) { return t; }

template <std::integral T> std::string dump(T t) { return std::to_string(t); }

inline std::string dump(const u128& t) {
    if (t == 0) {
        return "0";
    }

    std::string s;
    u128 x = t;
    while (x) {
        s += char(x % 10 + '0');
        x /= 10;
    }
    std::ranges::reverse(s);
    return s;
}

inline std::string dump(const i128& t) {
    if (t < 0) {
        return "-" + dump((u128)(-t));
    } else {
        return dump((u128)(t));
    }
}

template <std::floating_point T> std::string dump(T t) {
    return std::to_string(t);
}

template <class T>
    requires requires(T t) { t.dump(); }
std::string dump(T t);
template <class T>
    requires(!requires(T t) { t.dump(); }) && (requires(T t) { t.val(); })
std::string dump(T t);

template <class T, std::size_t N> std::string dump(const std::array<T, N>&);
template <class T> std::string dump(const std::vector<T>&);
template <class T1, class T2> std::string dump(const std::pair<T1, T2>&);
template <class K, class V> std::string dump(const std::map<K, V>&);
template <class T> std::string dump(const std::set<T>&);
template <class... Ts> std::string dump(const std::tuple<Ts...>& t);

template <class T>
    requires requires(T t) { t.dump(); }
std::string dump(T t) {
    return dump(t.dump());
}

template <class T>
    requires(!requires(T t) { t.dump(); }) && (requires(T t) { t.val(); })
std::string dump(T t) {
    return dump(t.val());
}

template <class T, std::size_t N> std::string dump(const std::array<T, N>& a) {
    std::string s = "[";
    for (size_t i = 0; i < N; i++) {
        if (i) {
            s += ", ";
        }
        s += dump(a[i]);
    }
    s += "]";
    return s;
}

template <class T> std::string dump(const std::vector<T>& v) {
    std::string s = "[";
    for (std::size_t i = 0; i < v.size(); ++i) {
        s += dump(v[i]);
        if (i + 1 != v.size()) {
            s += ", ";
        }
    }
    s += "]";
    return s;
}

template <class T1, class T2> std::string dump(const std::pair<T1, T2>& p) {
    std::string s = "(";
    s += dump(p.first);
    s += ", ";
    s += dump(p.second);
    s += ")";
    return s;
}

template <class K, class V> std::string dump(const std::map<K, V>& m) {
    std::string s = "{";
    for (auto it = m.begin(); it != m.end(); ++it) {
        if (it != m.begin()) {
            s += ", ";
        }
        s += dump(it->first);
        s += ": ";
        s += dump(it->second);
    }
    s += "}";
    return s;
}

template <class T> std::string dump(const std::set<T>& s) {
    std::string str = "{";
    for (auto it = s.begin(); it != s.end(); ++it) {
        if (it != s.begin()) {
            str += ", ";
        }
        str += dump(*it);
    }
    str += "}";
    return str;
}

template <class... Ts> std::string dump(const std::tuple<Ts...>& t) {
    std::string s = "(";
    [&]<std::size_t... I>(std::index_sequence<I...>) {
        ((s += dump(std::get<I>(t)) + ((I < sizeof...(Ts) - 1) ? ", " : "")),
         ...);
    }(std::make_index_sequence<sizeof...(Ts)>());
    s += ")";
    return s;
}

}  // namespace yosupo

namespace yosupo {

template <i32 MOD> struct ModInt8 {
  private:
    using modint = ModInt<MOD>;
    static_assert(sizeof(modint) == 4);

    alignas(32) std::array<modint, 8> x;

    using m256i_u = __m256i_u;
    __attribute__((target("avx2"))) m256i_u to_m256i_u() const {
        return _mm256_loadu_si256((m256i_u*)x.data());
    }
    __attribute__((target("avx2"))) explicit ModInt8(m256i_u _x) {
        _mm256_storeu_si256((m256i_u*)x.data(), _x);
    }

  public:
    __attribute__((target("avx2"))) ModInt8() : x() {}
    __attribute__((target("avx2"))) explicit ModInt8(modint _x) { x.fill(_x); }

    __attribute__((target("avx2"))) ModInt8(std::span<const modint, 8> _x) {
        std::copy_n(_x.begin(), 8, x.begin());
    }
    __attribute__((target("avx2"))) ModInt8(modint x0,
                                            modint x1,
                                            modint x2,
                                            modint x3,
                                            modint x4,
                                            modint x5,
                                            modint x6,
                                            modint x7)
        : ModInt8(std::array<modint, 8>{x0, x1, x2, x3, x4, x5, x6, x7}) {}

    __attribute__((target("avx2"))) std::array<u32, 8> val() const {
        auto _x = to_m256i_u();
        auto a = reduce_mul(_x, _mm256_set1_epi32(1));
        alignas(32) std::array<u32, 8> b;
        _mm256_storeu_si256((__m256i_u*)b.data(),
                            min(a, _mm256_sub_epi32(a, MOD_X())));
        return b;
    }

    __attribute__((target("avx2"))) std::array<modint, 8> to_array() const {
        return x;
    }

    __attribute__((target("avx2"))) friend ModInt8 operator+(
        const ModInt8& lhs,
        const ModInt8& rhs) {
        auto _x = _mm256_add_epi32(lhs.to_m256i_u(), rhs.to_m256i_u());
        return ModInt8(min(_x, _mm256_sub_epi32(_x, MOD2_X())));
    }
    __attribute__((target("avx2"))) ModInt8& operator+=(const ModInt8& rhs) {
        return *this = *this + rhs;
    }

    __attribute__((target("avx2"))) friend ModInt8 operator-(
        const ModInt8& lhs,
        const ModInt8& rhs) {
        auto _x = _mm256_sub_epi32(lhs.to_m256i_u(), rhs.to_m256i_u());
        return ModInt8(min(_x, _mm256_add_epi32(_x, MOD2_X())));
    }
    __attribute__((target("avx2"))) ModInt8& operator-=(const ModInt8& rhs) {
        return *this = *this - rhs;
    }

    __attribute__((target("avx2"))) friend ModInt8 operator*(
        const ModInt8& lhs,
        const ModInt8& rhs) {
        return ModInt8(reduce_mul(lhs.to_m256i_u(), rhs.to_m256i_u()));
    }

    __attribute__((target("avx2"))) ModInt8& operator*=(const ModInt8& rhs) {
        return *this = *this * rhs;
    }

    __attribute__((target("avx2"))) ModInt8 operator-() const {
        return ModInt8() - *this;
    }

    __attribute__((target("avx2"))) friend bool operator==(const ModInt8& lhs,
                                                           const ModInt8& rhs) {
        auto _x = lhs.to_m256i_u();
        auto _y = rhs.to_m256i_u();
        _x = min(_x, _mm256_sub_epi32(_x, MOD_X()));
        _y = min(_y, _mm256_sub_epi32(_y, MOD_X()));
        auto z = _mm256_xor_si256(_x, _y);
        return _mm256_testz_si256(z, z);
    }

    // a.permutevar(idx)[i] = a[idx[i] % 8]
    __attribute__((target("avx2"))) ModInt8
    permutevar(const std::array<u32, 8>& idx) const {
        return ModInt8(_mm256_permutevar8x32_epi32(
            to_m256i_u(), _mm256_loadu_si256((const m256i_u*)idx.data())));
    }

    template <u8 MASK>
    __attribute__((target("avx2"))) friend ModInt8 blend(const ModInt8& lhs,
                                                         const ModInt8& rhs) {
        return ModInt8(
            _mm256_blend_epi32(lhs.to_m256i_u(), rhs.to_m256i_u(), MASK));
    }

    __attribute__((target("avx2"))) std::string dump() const {
        return yosupo::dump(val());
    }

  private:
    static constexpr u32 B2 = pow_mod_u64(2, 64, MOD);
    static constexpr u32 INV = -inv_u32(MOD);

    __attribute__((target("avx2"))) static m256i_u MOD_X() {
        return _mm256_set1_epi32(MOD);
    }
    __attribute__((target("avx2"))) static m256i_u MOD2_X() {
        return _mm256_set1_epi32(2 * MOD);
    }
    __attribute__((target("avx2"))) static m256i_u N_INV_X() {
        return _mm256_set1_epi32(INV);
    }

    // Input: l * r <= 2^32 * MOD
    // Output: l * r >>= 2^32
    __attribute__((target("avx2"))) static m256i_u reduce_mul(
        const m256i_u& l,
        const m256i_u& r) {
        auto x0 = mul_even(l, r);
        auto x1 = mul_even(_mm256_shuffle_epi32(l, 0xf5),
                           _mm256_shuffle_epi32(r, 0xf5));
        x0 += mul_even(mul_even(x0, N_INV_X()), MOD_X());
        x1 += mul_even(mul_even(x1, N_INV_X()), MOD_X());

        return _mm256_blend_epi32(_mm256_srli_epi64(x0, 32), x1, 0b10101010);
    }

    /* SIMD utils */
    __attribute__((target("avx2"))) static __m256i_u min(const __m256i_u& l,
                                                         const __m256i_u& r) {
        return _mm256_min_epu32(l, r);
    }
    __attribute__((target("avx2"))) static __m256i_u max(const __m256i_u& l,
                                                         const __m256i_u& r) {
        return _mm256_max_epu32(l, r);
    }
    // (lr[0], lr[2], lr[4], lr[6])
    __attribute__((target("avx2"))) static __m256i_u mul_even(
        const __m256i_u& l,
        const __m256i_u& r) {
        return _mm256_mul_epu32(l, r);
    }
};

}  // namespace yosupo

#include <ranges>

namespace yosupo {

template <class T> bool chmin(T& a, const T& b) {
    if (a > b) {
        a = b;
        return true;
    }
    return false;
}

template <class T> bool chmax(T& a, const T& b) {
    if (a < b) {
        a = b;
        return true;
    }
    return false;
}

template <class T> T floor_div(T x, T y) {
    auto d = x / y;
    auto r = x % y;
    if (r == 0) return d;
    if ((r > 0) == (y > 0)) return d;
    return d - 1;
}
template <class T> T ceil_div(T x, T y) {
    auto d = x / y;
    auto r = x % y;
    if (r == 0) return d;
    if ((r > 0) == (y > 0)) return d + 1;
    return d;
}

template <std::ranges::input_range R>
std::vector<std::ranges::range_value_t<R>> to_vec(R&& r) {
    auto common = r | std::views::common;
    return std::vector(common.begin(), common.end());
}

template <class T, class Comp = std::equal_to<>>
void dedup(std::vector<T>& v, Comp comp = Comp{}) {
    auto it = std::ranges::unique(v, comp);
    v.erase(it.begin(), it.end());
}

template <size_t N, class T> std::span<T, N> subspan(std::span<T> a, int idx) {
    return a.subspan(idx).template first<N>();
}

inline auto rep(int l, int r) {
    if (l > r) return std::views::iota(l, l);
    return std::views::iota(l, r);
}

}  // namespace yosupo

namespace yosupo {

template <i32 MOD> struct FFTInfo {
    using modint = ModInt<MOD>;
    using modint8 = ModInt8<MOD>;

    static constexpr u32 g = smallest_primitive_root(MOD);
    static constexpr int ord2 = std::countr_zero((u32)(MOD - 1));

    // w[i]^(2^i) == 1 : w[i] is the fft omega of n=2^i
    static constexpr std::array<modint, ord2 + 1> w = []() {
        std::array<modint, ord2 + 1> v;
        v[ord2] = modint(g).pow((MOD - 1) >> ord2);
        for (int i = ord2 - 1; i >= 0; i--) {
            v[i] = v[i + 1] * v[i + 1];
        }
        return v;
    }();
    static constexpr std::array<modint, ord2 + 1> iw = []() {
        std::array<modint, ord2 + 1> v;
        v[ord2] = w[ord2].inv();
        for (int i = ord2 - 1; i >= 0; i--) {
            v[i] = v[i + 1] * v[i + 1];
        }
        return v;
    }();

    static constexpr std::array<modint, ord2 + 1> rot8 = []() {
        std::array<modint, std::max(0, ord2 + 1)> v;
        for (int i = 3; i <= ord2; i++) {
            v[i] = w[i];
            for (int j = 3; j < i; j++) {
                v[i] *= iw[j];
            }
        }
        return v;
    }();
    static constexpr std::array<modint, ord2 + 1> irot8 = []() {
        std::array<modint, std::max(0, ord2 + 1)> v;
        for (int i = 3; i <= ord2; i++) {
            v[i] = iw[i];
            for (int j = 3; j < i; j++) {
                v[i] *= w[j];
            }
        }
        return v;
    }();
    // rot[i] * rot_shift8(i) = rot[i + 8]
    modint rot_shift8(u32 i) const { return rot8[std::countr_one(i >> 3) + 3]; }
    modint irot_shift8(u32 i) const {
        return irot8[std::countr_one(i >> 3) + 3];
    }

    static constexpr std::array<modint, ord2 + 1> rot4 = []() {
        std::array<modint, std::max(0, ord2 + 1)> v;
        for (int i = 4; i <= ord2; i++) {
            v[i] = w[i];
            for (int j = 4; j < i; j++) {
                v[i] *= iw[j];
            }
        }
        return v;
    }();
    static constexpr std::array<modint, ord2 + 1> irot4 = []() {
        std::array<modint, std::max(0, ord2 + 1)> v;
        for (int i = 4; i <= ord2; i++) {
            v[i] = iw[i];
            for (int j = 4; j < i; j++) {
                v[i] *= w[j];
            }
        }
        return v;
    }();
    std::array<std::array<modint, 8>, ord2 + 1> rot16i = []() {
        std::array<std::array<modint, 8>, ord2 + 1> v;
        for (int i = 4; i <= ord2; i++) {
            std::array<modint, 8> buf;
            buf[0] = 1;
            for (int j = 1; j < 8; j++) {
                buf[j] = buf[j - 1] * rot4[i];
            }
            v[i] = buf;
        }
        return v;
    }();
    std::array<std::array<modint, 8>, ord2 + 1> irot16i = []() {
        std::array<std::array<modint, 8>, ord2 + 1> v;
        for (int i = 4; i <= ord2; i++) {
            std::array<modint, 8> buf;
            buf[0] = 1;
            for (int j = 1; j < 8; j++) {
                buf[j] = buf[j - 1] * irot4[i];
            }
            v[i] = buf;
        }
        return v;
    }();
    // rot[i * j] * rot_shift16i(i)[j] = rot[(i + 8) * j]
    __attribute__((target("avx2"))) modint8 rot_shift16i(u32 i) const {
        return modint8(rot16i[std::countr_one(i >> 4) + 4]);
    }
    __attribute__((target("avx2"))) modint8 irot_shift16i(u32 i) const {
        return modint8(irot16i[std::countr_one(i >> 4) + 4]);
    }
};
template <i32 MOD> const FFTInfo<MOD> fft_info = FFTInfo<MOD>();

template <i32 MOD>
__attribute__((target("avx2"))) void butterfly(std::vector<ModInt<MOD>>& a) {
    const int n = int(a.size());
    const int lg = std::countr_zero((u32)n);
    assert(n == (1 << lg));

    using modint8 = ModInt8<MOD>;
    const FFTInfo<MOD>& info = fft_info<MOD>;

    if (n == 1) {
        return;
    }
    if (n == 2) {
        auto a0 = a[0], a1 = a[1];
        a[0] = a0 + a1;
        a[1] = a0 - a1;
        return;
    }
    if (n == 4) {
        auto a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3];
        auto x = (a1 - a3) * info.w[2];

        a[0] = (a0 + a2) + (a1 + a3);
        a[1] = (a0 + a2) - (a1 + a3);
        a[2] = (a0 - a2) + x;
        a[3] = (a0 - a2) - x;
        return;
    }

    int h = lg;
    if (h % 2 == 0) {
        // 2-base
        int len = n / 2;
        for (int i = 0; i < len; i += 8) {
            auto l = modint8(subspan<8>(std::span{a}, 0 * len + i));
            auto r = modint8(subspan<8>(std::span{a}, 1 * len + i));

            std::copy_n((l + r).to_array().data(), 8,
                        subspan<8>(std::span{a}, 0 * len + i).data());
            std::copy_n((l - r).to_array().data(), 8,
                        subspan<8>(std::span{a}, 1 * len + i).data());
        }
        h--;
    }

    while (h >= 5) {
        // 4-base
        const modint8 w2x = modint8(info.w[2]);

        modint8 rotx = modint8(1);
        for (int start = 0; start < n; start += (1 << h)) {
            const modint8 rot2x = rotx * rotx;
            const modint8 rot3x = rot2x * rotx;

            int len = 1 << (h - 2);
            for (int i = 0; i < len; i += 8) {
                auto a0 =
                    modint8(subspan<8>(std::span{a}, start + 0 * len + i));
                auto a1 =
                    modint8(subspan<8>(std::span{a}, start + 1 * len + i)) *
                    rotx;
                auto a2 =
                    modint8(subspan<8>(std::span{a}, start + 2 * len + i)) *
                    rot2x;
                auto a3 =
                    modint8(subspan<8>(std::span{a}, start + 3 * len + i)) *
                    rot3x;

                auto x = (a1 - a3) * w2x;

                std::copy_n(
                    (a0 + a2 + a1 + a3).to_array().data(), 8,
                    subspan<8>(std::span{a}, start + 0 * len + i).data());
                std::copy_n(
                    (a0 + a2 - a1 - a3).to_array().data(), 8,
                    subspan<8>(std::span{a}, start + 1 * len + i).data());
                std::copy_n(
                    (a0 - a2 + x).to_array().data(), 8,
                    subspan<8>(std::span{a}, start + 2 * len + i).data());
                std::copy_n(
                    (a0 - a2 - x).to_array().data(), 8,
                    subspan<8>(std::span{a}, start + 3 * len + i).data());
            }
            rotx *= modint8(info.rot_shift8(8 * (start >> h)));
        }
        h -= 2;
    }

    {
        // fft each element
        const ModInt<MOD> w4 = info.w[2], w8 = info.w[3];
        const auto step4 = ModInt8<MOD>(1, 1, 1, w4, 1, 1, 1, w4);
        const auto step8 =
            ModInt8<MOD>(1, 1, 1, 1, 1, w8, w8 * w8, w8 * w8 * w8);

        modint8 rotxi = modint8(1);
        for (int i = 0; i < n; i += 8) {
            auto x = modint8(subspan<8>(std::span{a}, i)) * rotxi;
            x = (blend<0b11110000>(x, -x) +
                 x.permutevar({4, 5, 6, 7, 0, 1, 2, 3})) *
                step8;
            x = (blend<0b11001100>(x, -x) +
                 x.permutevar({2, 3, 0, 1, 6, 7, 4, 5})) *
                step4;
            x = (blend<0b10101010>(x, -x) +
                 x.permutevar({1, 0, 3, 2, 5, 4, 7, 6}));
            std::copy_n(x.to_array().data(), 8,
                        subspan<8>(std::span{a}, i).data());

            if (i + 8 < n) rotxi *= info.rot_shift16i(2 * i);
        }
    }
}

template <i32 MOD>
__attribute__((target("avx2"))) void butterfly_inv(
    std::vector<ModInt<MOD>>& a) {
    const int n = int(a.size());
    const int lg = std::countr_zero((u32)n);
    assert(n == (1 << lg));

    using modint8 = ModInt8<MOD>;
    const FFTInfo<MOD>& info = fft_info<MOD>;

    if (n == 1) {
        return;
    }
    if (n == 2) {
        auto a0 = a[0], a1 = a[1];
        a[0] = a0 + a1;
        a[1] = a0 - a1;
        return;
    }
    if (n == 4) {
        auto a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3];
        auto x0 = a0 + a1;
        auto x1 = a0 - a1;
        auto x2 = a2 + a3;
        auto x3 = (a2 - a3) * info.iw[2];

        a[0] = x0 + x2;
        a[1] = x1 + x3;
        a[2] = x0 - x2;
        a[3] = x1 - x3;
        return;
    }

    {
        // ifft each element
        const ModInt<MOD> iw4 = info.iw[2], iw8 = info.iw[3];
        const auto step4 = ModInt8<MOD>(1, 1, 1, iw4, 1, 1, 1, iw4);
        const auto step8 =
            ModInt8<MOD>(1, 1, 1, 1, 1, iw8, iw8 * iw8, iw8 * iw8 * iw8);
        modint8 irotxi = modint8(1);
        for (int i = 0; i < n; i += 8) {
            auto x = modint8(subspan<8>(std::span{a}, i));
            x = (blend<0b10101010>(x, -x) +
                 x.permutevar({1, 0, 3, 2, 5, 4, 7, 6})) *
                step4;
            x = (blend<0b11001100>(x, -x) +
                 x.permutevar({2, 3, 0, 1, 6, 7, 4, 5})) *
                step8;
            x = (blend<0b11110000>(x, -x) +
                 x.permutevar({4, 5, 6, 7, 0, 1, 2, 3}));

            std::copy_n((x * irotxi).to_array().begin(), 8,
                        subspan<8>(std::span{a}, i).begin());

            if (i + 8 < n) irotxi *= info.irot_shift16i(2 * i);
        }
    }

    int h = 3;
    while (h + 2 <= lg) {
        h += 2;

        // 4-base
        const modint8 w2 = modint8(info.iw[2]);

        modint8 rotx = modint8(1);
        for (int start = 0; start < n; start += (1 << h)) {
            const auto rot2x = rotx * rotx;
            const auto rot3x = rot2x * rotx;
            int len = 1 << (h - 2);
            for (int i = 0; i < len; i += 8) {
                auto a0 =
                    modint8(subspan<8>(std::span{a}, start + 0 * len + i));
                auto a1 =
                    modint8(subspan<8>(std::span{a}, start + 1 * len + i));
                auto a2 =
                    modint8(subspan<8>(std::span{a}, start + 2 * len + i));
                auto a3 =
                    modint8(subspan<8>(std::span{a}, start + 3 * len + i));

                auto x0 = a0 + a1;
                auto x1 = a0 - a1;
                auto x2 = a2 + a3;
                auto x3 = (a2 - a3) * w2;

                std::copy_n(
                    (x0 + x2).to_array().begin(), 8,
                    subspan<8>(std::span{a}, start + 0 * len + i).begin());
                std::copy_n(
                    ((x1 + x3) * rotx).to_array().begin(), 8,
                    subspan<8>(std::span{a}, start + 1 * len + i).begin());
                std::copy_n(
                    ((x0 - x2) * rot2x).to_array().begin(), 8,
                    subspan<8>(std::span{a}, start + 2 * len + i).begin());
                std::copy_n(
                    ((x1 - x3) * rot3x).to_array().data(), 8,
                    subspan<8>(std::span{a}, start + 3 * len + i).begin());
            }
            rotx *= modint8(info.irot_shift8(8 * (start >> h)));
        }
    }

    if (h + 1 == lg) {
        // 2-base
        int len = n / 2;
        for (int i = 0; i < len; i += 8) {
            auto l = modint8(subspan<8>(std::span{a}, 0 * len + i));
            auto r = modint8(subspan<8>(std::span{a}, 1 * len + i));

            std::copy_n((l + r).to_array().begin(), 8,
                        subspan<8>(std::span{a}, 0 * len + i).begin());
            std::copy_n((l - r).to_array().begin(), 8,
                        subspan<8>(std::span{a}, 1 * len + i).begin());
        }
        h++;
    }
}

template <i32 MOD>
__attribute__((target("avx2"))) std::vector<ModInt<MOD>> convolution_fft(
    std::vector<ModInt<MOD>> a,
    std::vector<ModInt<MOD>> b) {
    if (a.empty() || b.empty()) return {};

    int n = int(a.size()), m = int(b.size());
    int z = (int)std::bit_ceil((unsigned int)(n + m - 1));

    a.resize(z);
    butterfly(a);
    b.resize(z);
    butterfly(b);
    for (int i = 0; i < z; i++) {
        a[i] *= b[i];
    }
    butterfly_inv(a);
    a.resize(n + m - 1);
    auto iz = ModInt<MOD>(z).inv();
    for (int i = 0; i < n + m - 1; i++) a[i] *= iz;
    return a;
}

template <i32 MOD>
__attribute__((target("avx2"))) std::vector<ModInt<MOD>> convolution_naive(
    const std::vector<ModInt<MOD>>& a,
    const std::vector<ModInt<MOD>>& b) {
    if (a.empty() || b.empty()) return {};

    int n = int(a.size()), m = int(b.size());
    int m8 = (m + 7) / 8 * 8;
    std::vector<ModInt<MOD>> ans(n + m8 - 1);
    using modint = ModInt<MOD>;
    using modint8 = ModInt8<MOD>;

    modint8 b_last = [&]() {
        std::array<modint, 8> b_vec = {};
        std::copy(b.begin() + m8 - 8, b.end(), b_vec.begin());
        return modint8(b_vec);
    }();

    for (int i = 0; i < n; ++i) {
        modint ai = a[i];
        modint8 ai_vec(ai);
        for (int j = 0; j < m8; j += 8) {
            modint8 b_vec =
                (j < m8 - 8) ? modint8(subspan<8>(std::span{b}, j)) : b_last;
            modint8 ans_vec(subspan<8>(std::span{ans}, i + j));
            modint8 prod = ai_vec * b_vec;
            modint8 updated_ans = ans_vec + prod;
            std::copy_n(updated_ans.to_array().data(), 8,
                        subspan<8>(std::span{ans}, i + j).data());
        }
    }

    ans.resize(n + m - 1);
    return ans;
}

template <i32 MOD>
__attribute__((target("avx2"))) std::vector<ModInt<MOD>> convolution(
    const std::vector<ModInt<MOD>>& a,
    const std::vector<ModInt<MOD>>& b) {
    if (a.empty() || b.empty()) return {};
    int n = int(a.size()), m = int(b.size());

    if (std::min(n, m) < 100) {
        if (n < m) return convolution_naive(a, b);
        return convolution_naive(b, a);
    }
    return convolution_fft(a, b);
}

}  // namespace yosupo

namespace yosupo {

template <i32 MOD> struct ModVec {
  private:
    using modint = ModInt<MOD>;
    using modint8 = ModInt8<MOD>;

  public:
    ModVec() {}
    explicit ModVec(size_t n) : v(n) {}
    ModVec(const std::vector<modint>& _v) : v(_v) {}
    ModVec(std::initializer_list<modint> init) : v(init) {}

    modint& operator[](size_t i) { return v[i]; }
    const modint& operator[](size_t i) const { return v[i]; }

    bool operator==(const ModVec& rhs) const { return v == rhs.v; }

    ModVec& operator+=(const ModVec& rhs) {
        if (size() < rhs.size()) v.resize(rhs.size());
        // TODO: simd
        for (int i = 0; i < std::ssize(rhs); i++) {
            v[i] += rhs.v[i];
        }
        return *this;
    }
    friend ModVec operator+(const ModVec& lhs, const ModVec& rhs) {
        return ModVec(lhs) += rhs;
    }
    ModVec& operator-=(const ModVec& rhs) {
        if (size() < rhs.size()) v.resize(rhs.size());
        // TODO: simd
        for (int i = 0; i < std::ssize(rhs); i++) {
            v[i] -= rhs.v[i];
        }
        return *this;
    }
    friend ModVec operator-(const ModVec& lhs, const ModVec& rhs) {
        return ModVec(lhs) -= rhs;
    }

    friend ModVec operator*(const ModVec& lhs, const ModVec& rhs) {
        return ModVec(convolution(lhs.v, rhs.v));
    }
    ModVec& operator*=(const ModVec& rhs) { return *this = *this * rhs; }

    size_t size() const { return v.size(); }
    void resize(size_t n) { v.resize(n); }

    modint& front() { return v.front(); }
    const modint& front() const { return v.front(); }
    modint& back() { return v.back(); }
    const modint& back() const { return v.back(); }

    void push_back(const modint& x) { v.push_back(x); }
    void pop_back() { v.pop_back(); }

    std::string dump() const { return ::yosupo::dump(v); }

    static ModVec prod(std::vector<ModVec> pols) {
        if (pols.empty()) return ModVec({modint(1)});
        if (pols.size() == 1) return pols[0];

        // sort by the first
        std::priority_queue<std::pair<int, int>,
                            std::vector<std::pair<int, int>>, std::greater<>>
            que;
        for (int i = 0; i < std::ssize(pols); i++) {
            que.emplace(pols[i].size(), i);
        }

        while (que.size() > 1) {
            auto [a, i] = que.top();
            que.pop();
            auto [b, j] = que.top();
            que.pop();
            pols[i] *= pols[j];
            pols[j] = ModVec();

            que.emplace(pols[i].size(), i);
        }

        auto [_, i] = que.top();
        return pols[i];
    }

  private:
    std::vector<modint> v;
};

using ModVec998244353 = ModVec<998244353>;
using ModVec1000000007 = ModVec<1000000007>;

}  // namespace yosupo

#include <random>


namespace yosupo {

struct Xoshiro256StarStar {
  public:
    using result_type = u64;
    Xoshiro256StarStar() : Xoshiro256StarStar(0) {}
    explicit Xoshiro256StarStar(u64 seed) {
        // use splitmix64
        // Reference: http://xoshiro.di.unimi.it/splitmix64.c
        for (int i = 0; i < 4; i++) {
            u64 z = (seed += 0x9e3779b97f4a7c15);
            z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
            z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
            s[i] = z ^ (z >> 31);
        }
    }

    static constexpr result_type min() { return 0; }
    static constexpr result_type max() { return -1; }

    result_type operator()() {
        const u64 result_starstar = rotl(s[1] * 5, 7) * 9;

        const u64 t = s[1] << 17;

        s[2] ^= s[0];
        s[3] ^= s[1];
        s[1] ^= s[2];
        s[0] ^= s[3];

        s[2] ^= t;

        s[3] = rotl(s[3], 45);

        return result_starstar;
    }

  private:
    // Use xoshiro256**
    // Refereces: http://xoshiro.di.unimi.it/xoshiro256starstar.c
    static u64 rotl(const u64 x, int k) { return (x << k) | (x >> (64 - k)); }

    std::array<u64, 4> s;
};

// https://github.com/wangyi-fudan/wyhash
struct WYRand {
  public:
    using result_type = u64;
    explicit WYRand(u64 seed) : s(seed) {}

    static constexpr result_type min() { return 0; }
    static constexpr result_type max() { return -1; }

    result_type operator()() {
        s += 0x2d358dccaa6c78a5;
        auto x = (u128)s * (s ^ 0x8bb84b93962eacc9);
        return (u64)(x ^ (x >> 64));
    }

  private:
    uint64_t s;
};
using Random = WYRand;
inline Random get_random() { return Random(std::random_device()()); }

namespace internal {
inline Random global_gen = get_random();
}
inline Random& global_gen() { return internal::global_gen; }

template <class G>
concept random_64 = std::uniform_random_bit_generator<G> &&
                    std::same_as<u64, std::invoke_result_t<G&>> &&
                    G::min() == u64(0) && G::max() == u64(-1);

namespace internal {

// random choice from [0, upper]
template <random_64 G> u64 uniform_u64(u64 upper, G& gen) {
    if (upper == 0) return 0;
    u64 mask = (std::bit_floor(upper) << 1) - 1;
    while (true) {
        u64 r = gen() & mask;
        if (r <= upper) return r;
    }
}

// random choice from [0, upper], faster than uniform_u64
template <random_64 G> u64 random_u64(u64 upper, G& gen) {
    return (u64)(((u128)(upper) + 1) * gen() >> 64);
}

}  // namespace internal

template <class T, random_64 G> T uniform(T lower, T upper, G& gen) {
    return T(lower + internal::uniform_u64(u64(upper) - u64(lower), gen));
}
template <class T> T uniform(T lower, T upper) {
    return uniform(lower, upper, global_gen());
}

template <std::unsigned_integral T, random_64 G> T uniform(G& gen) {
    return T(gen());
}
template <std::signed_integral T, random_64 G> T uniform(G& gen) {
    return T(gen() + (u64)std::numeric_limits<T>::min());
}
template <class T, random_64 G>
    requires requires {
        { T::mod() } -> std::integral;
    }
T uniform(G& gen) {
    return T(uniform(0, T::mod() - 1, gen));
}
template <class T> T uniform() { return uniform<T>(global_gen()); }

template <class T, random_64 G> T random(T lower, T upper, G& gen) {
    return T(lower + internal::random_u64(u64(upper) - u64(lower), gen));
}
template <class T> T random(T lower, T upper) {
    return random(lower, upper, global_gen());
}

template <random_64 G> bool uniform_bool(G& gen) { return gen() & 1; }
inline bool uniform_bool() { return uniform_bool(global_gen()); }

// select 2 elements from [lower, uppper]
template <class T, random_64 G>
std::pair<T, T> uniform_pair(T lower, T upper, G& gen) {
    assert(upper - lower >= 1);
    T a, b;
    do {
        a = uniform(lower, upper, gen);
        b = uniform(lower, upper, gen);
    } while (a == b);
    if (a > b) std::swap(a, b);
    return {a, b};
}
template <class T> std::pair<T, T> uniform_pair(T lower, T upper) {
    return uniform_pair(lower, upper, global_gen());
}

// random 0.0 <= X < 1.0
template <class G> inline double random_01(G& gen) {
    constexpr double inv = 1.0 / ((double)(u64(1) << 63) * 2);
    return double(gen()) * inv;
}
inline double random_01() { return random_01(global_gen()); }

}  // namespace yosupo


namespace yosupo {

namespace internal {

template <class T> struct CombState {
    int size = 1;
    std::vector<T> fact = {T(1)};
    std::vector<T> inv_fact = {T(1)};
    std::vector<T> inv = {T(0)};

    void extend() {
        fact.resize(2 * size);
        inv_fact.resize(2 * size);
        inv.resize(2 * size);
        for (int i = size; i < 2 * size; i++) {
            fact[i] = fact[i - 1] * T(i);
        }
        inv_fact[2 * size - 1] = fact[2 * size - 1].inv();
        for (int i = 2 * size - 1; i >= size + 1; i--) {
            inv_fact[i - 1] = inv_fact[i] * T(i);
        }

        for (int i = size; i < 2 * size; i++) {
            inv[i] = inv_fact[i] * fact[i - 1];
        }

        size *= 2;
    }
};

template <class T> CombState<T>& get_comb_state(int n) {
    static CombState<T> state;
    while (state.size <= n) state.extend();
    return state;
}

}  // namespace internal

template <class T> T fact(int x) {
    assert(0 <= x);
    return internal::get_comb_state<T>(x).fact[x];
}

template <class T> T inv_fact(int x) {
    assert(0 <= x);
    return internal::get_comb_state<T>(x).inv_fact[x];
}

template <class T> T inv(int x) {
    assert(0 <= x);
    return internal::get_comb_state<T>(x).inv[x];
}

namespace internal {
template <class T> T comb(int n, int k) {
    return fact<T>(n) * inv_fact<T>(k) * inv_fact<T>(n - k);
}
}  // namespace internal

template <class T> T comb(int n, int k) {
    assert(0 <= k);
    if (0 <= n && n < k) return 0;

    if (n >= 0) {
        return internal::comb<T>(n, k);
    }
    T x = internal::comb<T>(k - n - 1, k);
    if (k % 2) x = -x;
    return x;
}

}  // namespace yosupo


#include <bitset>
#include <iostream>

using namespace yosupo;

using std::abs, std::pow, std::sqrt;
using std::array, std::vector, std::string, std::queue, std::deque;
using std::countl_zero, std::countl_one, std::countr_zero, std::countr_one;
using std::istream, std::ostream, std::cerr, std::endl;
using std::min, std::max, std::swap;
using std::pair, std::tuple, std::bitset;
using std::popcount;
using std::priority_queue, std::set, std::multiset, std::map;
using std::views::iota, std::views::reverse;

namespace ranges = std::ranges;
using ranges::sort, ranges::copy_n;

using uint = unsigned int;
using ll = long long;
using ull = unsigned long long;
constexpr ll TEN(int n) { return (n == 0) ? 1 : 10 * TEN(n - 1); }
template <class T> using V = vector<T>;
template <class T> using VV = V<V<T>>;

#ifdef YOSUPO_LOCAL

struct PrettyOS {
    ostream& os;
    bool first;

    template <class T> auto operator<<(T&& x) {
        if (!first) os << ", ";
        first = false;
        os << yosupo::dump(x);
        return *this;
    }
};
template <class... T> void dbg0(T&&... t) {
    (PrettyOS{cerr, true} << ... << t);
}
#define dbg(...)                                            \
    do {                                                    \
        cerr << __LINE__ << " : " << #__VA_ARGS__ << " = "; \
        dbg0(__VA_ARGS__);                                  \
        cerr << endl;                                       \
    } while (false);
#else
#define dbg(...)
#endif


using mint = ModInt998244353;
using mvec = ModVec<998244353>;

Scanner sc = Scanner(stdin);
Printer pr = Printer(stdout);


V<mint> gen0(int h, int s, int t, int turn) {
    const int len = 2 * (h + 1);
    const int rt = h + (h - t);

    const int B = 1000;

    mvec base = {1, 1, 1};    
    mvec base2 = {1};
    V<mvec> bases(B + 1);
    bases[0] = {1};
    for (int i : iota(0, B)) {
        bases[i + 1] = bases[i] * base;
    }

    mvec dp(len);
    dp[s] = 1;

    V<mint> vec(turn + 1);
    int phase_start = 0;
    while (phase_start <= turn) {
        dbg(phase_start);

        for (int step: iota(0, min(turn - phase_start + 1, B))) {
            int phase = phase_start + step;

            int k = int(bases[step].size());
            {
                int p = (t + phase) % len;
                while (p < int(dp.size()) + k) {
                    for (int i : iota(0, min(p + 1, k))) {
                        if (p - i < 0) continue;
                        if (p - i >= int(dp.size())) continue;
                        vec[phase] += dp[p - i] * bases[step][i];
                    }
                    p += len;
                }
            }
            {
                int p = (rt + phase) % len;
                while (p < int(dp.size()) + k) {
                    for (int i : iota(0, min(p + 1, k))) {
                        if (p - i < 0) continue;
                        if (p - i >= int(dp.size())) continue;

                        vec[phase] -= dp[p - i] * bases[step][i];
                    }
                    p += len;
                }
            }
        }

        dp *= bases[B];
        while (int(dp.size()) > len) {
            mint x = dp.back();
            dp[(int(dp.size()) - 1) % len] += x;
            dp.pop_back();
        }
        phase_start += B;
    }

    return vec;
}

// O(HT)
V<mint> gen1(int h, int s, int t, int turn) {
    V<mint> dp(h);
    dp[s] = 1;

    V<mint> vec(turn + 1);
    for (int phase : iota(0, turn + 1)) {
        vec[phase] = dp[t];
        V<mint> ndp = dp;
        for (int i : iota(0, h)) {
            if (i > 0) ndp[i - 1] += dp[i];
            if (i < h - 1) ndp[i + 1] += dp[i];
        }
        dp = ndp;
    }
    return vec;
}

V<mint> gen(int h, int s, int t, int turn) {
    if (h >= 3000) {
        // For large h, use the faster gen0
        return gen0(h, s, t, turn);
    } else {
        // For small h, use the more straightforward gen1
        return gen1(h, s, t, turn);
    }
}

void test() {
    for (int _ : iota(0, 1000)) {
        int h = uniform(2, 100);
        int s = uniform(0, h - 1);
        int t = uniform(0, h - 1);
        int turn = uniform(0, 100);
        auto v0 = gen0(h, s, t, turn);
        auto v1 = gen1(h, s, t, turn);
        assert(v0 == v1);
    }
}

int main() {
    //test();
    //return 0;

    int h, w, turn, a, b, c, d;
    sc.read(h, w, turn, a, b, c, d);
    a--; b--; c--; d--;


    auto v0 = gen(h, a, c, turn);
    auto v1 = gen(w, b, d, turn);

    //dbg(v0, v1);

    auto v = V<mint>(turn + 1);
    for (int i : iota(0, turn + 1)) {
        v[i] = v0[i] * v1[i];
    }

    // for (int t : iota(0, turn + 1)) {
    //     for (int i : iota(0, t)) {
    //         v[t] -= v[i] * comb<mint>(t, i);
    //     }
    // }
    // mint ans = v[turn];

    mint result = 0;
    for (int k = 0; k <= turn; ++k) {
        mint term = comb<mint>(turn, k) * v[k];
        if ((turn - k) % 2 == 1) {
            result -= term;
        } else {
            result += term;
        }
    }
    mint ans = result;

    pr.writeln(ans.val());
    return 0;
}

Submission Info

Submission Time
Task D - King
User yosupo
Language C++ 23 (gcc 12.2)
Score 1000
Code Size 59199 Byte
Status AC
Exec Time 13770 ms
Memory 26040 KiB

Compile Error

Main.cpp: In function ‘void test()’:
Main.cpp:1992:14: warning: unused variable ‘_’ [-Wunused-variable]
 1992 |     for (int _ : iota(0, 1000)) {
      |              ^

Judge Result

Set Name Sample All
Score / Max Score 0 / 0 1000 / 1000
Status
AC × 2
AC × 18
Set Name Test Cases
Sample 00_sample_00.txt, 00_sample_01.txt
All 00_sample_00.txt, 00_sample_01.txt, 01_random_00.txt, 01_random_01.txt, 01_random_02.txt, 01_random_03.txt, 01_random_04.txt, 01_random_05.txt, 02_max_00.txt, 02_max_01.txt, 02_max_02.txt, 03_corner_00.txt, 03_corner_01.txt, 04_sqrt_00.txt, 04_sqrt_01.txt, 04_sqrt_02.txt, 04_sqrt_03.txt, 04_sqrt_04.txt
Case Name Status Exec Time Memory
00_sample_00.txt AC 1 ms 3468 KiB
00_sample_01.txt AC 1 ms 3380 KiB
01_random_00.txt AC 6819 ms 18804 KiB
01_random_01.txt AC 8150 ms 18364 KiB
01_random_02.txt AC 3245 ms 13816 KiB
01_random_03.txt AC 4156 ms 14800 KiB
01_random_04.txt AC 27 ms 13704 KiB
01_random_05.txt AC 31 ms 13792 KiB
02_max_00.txt AC 13719 ms 26024 KiB
02_max_01.txt AC 13701 ms 26040 KiB
02_max_02.txt AC 13770 ms 25936 KiB
03_corner_00.txt AC 4108 ms 20428 KiB
03_corner_01.txt AC 8372 ms 24536 KiB
04_sqrt_00.txt AC 149 ms 13768 KiB
04_sqrt_01.txt AC 295 ms 13668 KiB
04_sqrt_02.txt AC 555 ms 13792 KiB
04_sqrt_03.txt AC 1136 ms 13792 KiB
04_sqrt_04.txt AC 2370 ms 13688 KiB