#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern "C" { #include "mmio.h" } #include #include #include #include "mlir/ExecutionEngine/SparseTensor/COO.h" #include "mlir/ExecutionEngine/SparseTensor/Storage.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/Support/FormatVariadic.h" #if DISTRIBUTED #include "parttensor_mpi_backend/Storage.h" #include "parttensor_mpi_backend/parttensor_mpi_backend.h" #include template using PartTensorStorage = mlir::parttensor_mpi::MPITensorStorage; using mlir::parttensor_mpi::Product; using mlir::parttensor_mpi::SplitLoHiPoint; using mlir::parttensor_mpi::SubtractPoints; #endif // DISTRIBUTED using namespace mlir::sparse_tensor; using std::begin; using std::end; using std::make_unique; using std::span; using std::unique_ptr; using std::vector; using index_t = uint64_t; void getRss() { std::ifstream file("/proc/self/status"); std::string line; #if DISTRIBUTED const auto rank = _mlir_ciface_mpi_getRank(); fmt::print("{}: ", rank); #endif while (std::getline(file, line)) { if (line.starts_with("VmRSS:")) { line = line.substr(7); std::istringstream iss(line); long rss{}; iss >> rss; fmt::print("{} GB\n", double(rss) / pow(2, 20)); std::string rest; iss >> rest; assert(rest == "kB"); break; } } } bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON, float abs_th = FLT_MIN) // those defaults are arbitrary and could be removed { assert(std::numeric_limits::epsilon() <= epsilon); assert(epsilon < 1.f); if (a == b) return true; auto diff = std::abs(a - b); auto norm = std::min((std::abs(a) + std::abs(b)), std::numeric_limits::max()); // or even faster: std::min(std::abs(a + b), // std::numeric_limits::max()); keeping this commented out until I // update figures below return diff < std::max(abs_th, epsilon * norm); } // copied from: https://en.cppreference.com/w/cpp/types/numeric_limits/epsilon template std::enable_if_t::is_integer, bool> equal_within_ulps(T x, T y, std::size_t n) { // Since `epsilon()` is the gap size (ULP, unit in the last place) // of floating-point numbers in interval [1, 2), we can scale it to // the gap size in interval [2^e, 2^{e+1}), where `e` is the exponent // of `x` and `y`. // If `x` and `y` have different gap sizes (which means they have // different exponents), we take the smaller one. Taking the bigger // one is also reasonable, I guess. const T m = std::min(std::fabs(x), std::fabs(y)); // Subnormal numbers have fixed exponent, which is `min_exponent - 1`. const int exp = m < std::numeric_limits::min() ? std::numeric_limits::min_exponent - 1 : std::ilogb(m); // We consider `x` and `y` equal if the difference between them is // within `n` ULPs. return std::abs(x - y) <= n * std::ldexp(std::numeric_limits::epsilon(), exp); } using index_t = uint64_t; template struct memref_t { uint64_t deadbeef; T *dataptr; uint64_t offset; uint64_t sizes[dims]; uint64_t strides[dims]; }; using memref_1d_i64 = memref_t; using memref_2d_f32 = memref_t; using memref_3d_f32 = memref_t; template T multiply(span sizes) { return std::accumulate(begin(sizes), end(sizes), size_t(1), std::multiplies()); } template void print_memref(memref_t out) { fmt::print("sizes: {}\n", out.sizes); fmt::print("strides: {}\n", out.strides); fmt::print("values: {}\n", span(out.dataptr, multiply(span(out.sizes, dims)))); } extern "C" { extern void *part_tensor_softmax(void *A, void *out_file_name); extern void printSeperator(); extern uint64_t _mlir_ciface_mpi_getRank(); extern void delSparseTensor(void *tensor); } void printSeperator() { printf("---------------------------------------------------------------------" "-----------\n"); } auto init_bin_matrix_sizes(const std::string &infile, unsigned &M, unsigned &N, unsigned &nnz) { auto base_name = infile.substr(0, infile.find_last_of('.')); auto info_file = base_name + ".info"; auto file = fopen(info_file.c_str(), "r"); if (!file) { std::cout << "Not valid file\n"; exit(1); } int M_, N_, nnz_; auto nfieldsRead = fscanf(file, "%d %d %d", &M_, &N_, &nnz_); M = M_; N = N_; nnz = nnz_; // fmt::print("M: {} N: {} nnz: {}\n", M, N, nnz); if (nfieldsRead != 3) { std::cout << "Could not read mm size\n"; exit(1); } fclose(file); } // std::tuple>> // init_matrix_a(std::string infile) { template auto init_bin_matrix_a(const std::string &infile, unsigned &M, unsigned &N, unsigned &nnz, unsigned Nf, FilterTy &&f, bool overrideValues = false) { std::mt19937 gen; gen.seed(0); std::uniform_real_distribution<> dis(0.0, 1.0); init_bin_matrix_sizes(infile, M, N, nnz); auto file = fopen(infile.c_str(), "rb"); std::ifstream edgeData( (infile.substr(0, infile.find_first_of('.')) + ".edge.data.bin").c_str(), std::ios::binary); auto stCooA = std::make_unique>( std::vector({index_t(M), index_t(N), index_t(Nf)})); std::vector coords(3); auto statusUpdate = 0; std::vector vals(Nf); for (auto i : llvm::seq(0u, nnz)) { if (statusUpdate++ == 10000000) { fmt::print("."); statusUpdate = 0; } // The coordinates are 2d but tensor is 3d auto numRead = fread(coords.data(), sizeof(decltype(coords)::value_type), std::size(coords) - 1, file); assert(numRead == std::size(coords) && "Read error"); const auto numBytesToRead = sizeof(decltype(vals)::value_type) * Nf; if (f(coords[0])) { edgeData.read(reinterpret_cast(vals.data()), numBytesToRead); for (auto j : llvm::seq(0u, Nf)) { coords.back() = j; stCooA->add(coords, vals[j]); } } else { edgeData.seekg(numBytesToRead, std::ios_base::cur); } } fmt::print("\n"); assert(!(ferror(file) || feof(file)) && "Input file too short"); auto numRead = fread(coords.data(), sizeof(decltype(coords)::value_type), std::size(coords), file); assert(numRead == 0 && "Input file too long"); fclose(file); return stCooA; } auto init_matrix_a(const std::string &infile, unsigned &M, unsigned &N, unsigned &nnz, bool overrideValues = false) { std::mt19937 gen; gen.seed(0); std::uniform_real_distribution<> dis(0.0, 1.0); MM_typecode mc = {}; auto file = fopen(infile.c_str(), "r"); auto retCode = mm_read_banner(file, (MM_typecode *)&mc); if (retCode != 0) { std::cout << "Not valid file\n"; exit(1); } int M_, N_, nnz_; retCode = mm_read_mtx_crd_size(file, &M_, &N_, &nnz_); M = M_; N = N_; nnz = nnz_; if (retCode != 0) { std::cout << "Could not read mm size\n"; exit(1); } auto stCooA = std::make_unique>( std::vector({index_t(M), index_t(N)})); const bool IsSymmetric = mm_is_symmetric(mc); const bool IsPattern = mm_is_pattern(mc); if (IsPattern) overrideValues = true; for (auto i : llvm::seq(0, nnz_)) { index_t r, c; float v; if (IsPattern) fscanf(file, "%ld %ld", &r, &c); else fscanf(file, "%ld %ld %f", &r, &c, &v); v = overrideValues ? dis(gen) : v; r--; c--; // mtx is 1 based stCooA->add({r, c}, v); if (IsSymmetric && r != c) { stCooA->add({c, r}, v); } } return stCooA; } template void init_feats(SparseTensorStorage *tensor, const std::string &infile, unsigned N, unsigned Dh, unsigned Nh, unsigned beginOffset = 0, bool ValidateFileSizes = false, bool overrideValues = false) { std::mt19937 gen; gen.seed(0); std::uniform_real_distribution<> dis(0.0, 1.0); std::ifstream file( (infile.substr(0, infile.find_first_of('.')) + ".vert.data.bin").c_str(), std::ios::binary); if (!file) { std::cout << "Not valid file\n"; exit(1); } std::streampos fileBegin = file.tellg(); // Get the current position (which is the file size) file.seekg(0, std::ios::end); // Seek to the end of the file size_t fileSize = file.tellg() - fileBegin; file.seekg(beginOffset, std::ios::beg); // Seek back to the base offset assert(!ValidateFileSizes || fileSize == N * Dh * Nh * sizeof(T) && "File size mismatch"); std::vector *vals; tensor->getValues(&vals); file.read(reinterpret_cast(vals->data()), sizeof(T) * vals->size()); // ValidateFileSizes -> file.good() assert((!ValidateFileSizes || file.good()) && "File read error"); return; } template auto init_feats(const std::string &infile, unsigned &N, unsigned Dh, unsigned Nh, unsigned beginOffset = 0, bool overrideValues = false) { std::mt19937 gen; gen.seed(0); std::uniform_real_distribution<> dis(0.0, 1.0); std::ifstream file(infile, std::ios::binary); if (!file) { std::cout << "Not valid file\n"; exit(1); } std::streampos fileBegin = file.tellg(); // Get the current position (which is the file size) file.seekg(0, std::ios::end); // Seek to the end of the file size_t fileSize = file.tellg() - fileBegin; file.seekg(beginOffset, std::ios::beg); // Seek back to the base offset assert(fileSize == N * Dh * Nh * sizeof(T) && "File size mismatch"); auto stCooA = std::make_unique>( std::vector({index_t(N), index_t(Dh), index_t(Nh)})); for (auto n : llvm::seq(0u, N)) for (auto dh : llvm::seq(0u, Dh)) for (auto nh : llvm::seq(0u, Nh)) { T val; file.read(reinterpret_cast(&val), sizeof(T)); val = overrideValues ? dis(gen) : val; stCooA->add({n, dh, nh}, val); assert(file.good() && "File read error"); } return stCooA; } std::unique_ptr> init_matrix_a(index_t rowSize = 4) { assert(0 && "Need to init with multiple features!"); auto dims = std::vector{rowSize, rowSize}; auto stCooA = std::make_unique>(dims); for (auto i : llvm::seq(0ul, rowSize)) for (auto j : llvm::seq(0ul, rowSize)) stCooA->add({i, j}, float(i * rowSize + j)); return stCooA; } template auto init_random(SparseTensorCOO &vec, size_t N, size_t seed = 0) { std::mt19937 gen; gen.seed(seed); std::uniform_real_distribution<> dis(T(0), T(100)); auto vals = vec.getElements(); for (auto i : llvm::seq(0ul, N)) (vals.at(i)).value = (dis(gen)); } template auto init_random(std::vector &vec, size_t N, size_t seed = 0) { std::mt19937 gen; gen.seed(seed); std::uniform_real_distribution<> dis(T(0), T(100)); for (auto i : llvm::seq(0ul, N)) vec.push_back(dis(gen)); } template <> auto init_random(std::vector &vec, size_t N, size_t seed) { std::mt19937 gen; gen.seed(seed); std::bernoulli_distribution dis(.5); for (auto i : llvm::seq(0ul, N)) vec.push_back(dis(gen)); } /// Initialize a 2D sparse square tensor with random values /// @param vec: SparseTensorCOO to be initialized /// @param N: rows and columns of the 2D tensor /// @param seed: seed for random number generator /// @param density: density of the tensor template auto init_random_sparse_2d(SparseTensorCOO &stCoo, size_t N, size_t seed, double density = 0.5) { std::mt19937 gen; gen.seed(seed); std::bernoulli_distribution dis(density); for (auto i : llvm::seq(0ul, N * N)) if (dis(gen)) stCoo.add({i / N, i % N}, T{1}); } template void write_tensor(span vec, std::string filename) { std::ofstream(filename, std::ios::binary) .write(reinterpret_cast(vec.data()), sizeof(T) * vec.size()); } template void write_vector_to_stcoo3d(const std::vector &vec, SparseTensorCOO &stCoo) { auto dims = stCoo.getDimSizes(); auto [N, Dh, Nh] = std::tuple(dims[0], dims[1], dims[2]); llvm::for_each(llvm::enumerate(vec), [&stCoo, N, Dh, Nh](auto pair) { auto [index, val] = pair; auto nh = index % Nh; auto rest = (index - nh) / Nh; auto dh = rest % Dh; auto n = (rest - dh) / Dh; stCoo.add({n, dh, nh}, val); }); } #if DISTRIBUTED auto get2DPartition(size_t rows, size_t cols, size_t nh, size_t rowParts, size_t colParts) { std::vector partitionPlan; auto rowPartitionSize = rows / rowParts; auto colPartitionSize = cols / colParts; assert(rows % rowParts == 0 && "rows % rowParts != 0"); assert(cols % colParts == 0 && "cols % colParts != 0"); for (int j = 0; j < cols; j += colPartitionSize) { for (int i = 0; i < rows; i += rowPartitionSize) { if (false && _mlir_ciface_mpi_getRank() == 0) std::cout << "(" << i << "," << j << ") -> (" << i + rowPartitionSize << "," << j + colPartitionSize << ") \n"; partitionPlan.push_back(index_t(i)); partitionPlan.push_back(index_t(j)); if (nh) partitionPlan.push_back(index_t(0)); partitionPlan.push_back(index_t(i + rowPartitionSize)); partitionPlan.push_back(index_t(j + colPartitionSize)); if (nh) partitionPlan.push_back(index_t(nh)); } } return partitionPlan; } #endif #define VEC_TO_MEMREF2D(v, size_0, size_1) \ (void *)0xdeadbeef, v.data(), 0, size_0, size_1, size_1, 1 #define VEC_TO_MEMREF3D(v, size_0, size_1, size_2) \ (void *)0xdeadbeef, v.data(), 0, size_0, size_1, size_2, size_2 *size_1, \ size_2, 1 #define TENSOR_2D_ARG(t) \ void *t##_deadbeef, void *t##_dataptr, uint64_t t##_offset, \ uint64_t t##_sizes_0, uint64_t t##_sizes_1, uint64_t t##_strides_0, \ uint64_t t##_strides_1 #define TENSOR_3D_ARG(t) \ void *t##_deadbeef, void *t##_dataptr, uint64_t t##_offset, \ uint64_t t##_sizes_0, uint64_t t##_sizes_1, uint64_t t##_sizes_2, \ uint64_t t##_strides_0, uint64_t t##_strides_1, uint64_t t##_strides_2 extern "C" void *sparse_mha(void *A, void *Q, void *K, void *V, void *); extern "C" void *pte_local_bsddmm(void *A, void *Q, void *K); extern "C" void *pte_local_bspmm(void *A, void *V); extern "C" void *pte_bsddmm(void *A, void *Q, void *K, index_t n1, index_t n2, index_t dh, index_t nh); extern "C" void *pte_local_sparse_mha(void *A, void *Q, void *K, void *V); extern "C" void *pte_sparse_mha(void *A, void *Q, void *K, void *V, index_t n1, index_t n2, index_t dh, index_t nh); extern "C" void lapis_initialize(); extern "C" void lapis_finalize(); void validate_cli(const argparse::ArgumentParser &p) { if (p.is_used("-n") && p.is_used("-i")) { fmt::print("Only one of -n OR -i can be specified\n"); exit(1); } if (p.is_used("-d") && p.is_used("-i")) { fmt::print("Only one of -d OR -i can be specified\n"); exit(1); } if (!p.is_used("--check") && !p.is_used("--ntimes")) { fmt::print("Atleast --check or --ntimes should be specified\n"); exit(1); } } int main(int argc, char **argv) { using namespace mlir::sparse_tensor; #if DISTRIBUTED MPI_Init(&argc, &argv); #endif #if defined(USE_KOKKOS) lapis_initialize(); #endif unsigned N = 4, Nh = 2, Dh = 2, Nnz{}, nTimes = 1; unsigned Nparts = 1; unsigned logRank = 99; // bool PerfOnly = false, LocalOnly = false, CheckCorrectness = false; bool LocalOnly = false, DistOnly = false, CheckCorrectness = false, ValidateFileSizes = false; double density = 0.5; std::string infile; argparse::ArgumentParser program(argv[0]); program.add_argument("-dh").store_into(Dh).help("head size, default = 2"); program.add_argument("-nh").store_into(Nh).help("#head, default = 2"); program.add_argument("-n").store_into(N).help("#nodes, default = 4"); program.add_argument("-i").store_into(infile).help("Input File"); program.add_argument("--ntimes").store_into(nTimes).help("rerun n times = 1"); program.add_argument("-d", "--density") .store_into(density) .help("sparsity density, default = 0.5"); program.add_argument("-np", "--nparts") .store_into(Nparts) .help("number of parts, default = 1"); program.add_argument("--logrank") .store_into(logRank) .help("log when rank == logrank, default = 99"); program.add_argument("--check") .store_into(CheckCorrectness) .help("Do correctness check, default = false") .flag(); program.add_argument("--validate-file-sizes") .store_into(ValidateFileSizes) .help("Make sure Nh and Dh are compatible with the input file sizes, " "default = false") .flag(); program.add_argument("--dist-only") .store_into(DistOnly) .help("Skip local run, default = false") .flag(); program.add_argument("--local-only") .store_into(LocalOnly) .help("Skip distributed run, default = false") .flag(); try { program.parse_args(argc, argv); } catch (const std::exception &err) { std::cerr << err.what() << std::endl; std::cerr << program; return 1; } validate_cli(program); decltype(init_matrix_a(4)) stCooA{}, stCooQ{}; #if DISTRIBUTED const auto rank = _mlir_ciface_mpi_getRank(); #endif // DISTRIBUTED if (!infile.empty()) { decltype(N) M; if (DistOnly) { #if DISTRIBUTED if (!infile.ends_with(".bin")) { fmt::print("Only binary input supported for distributed run\n"); exit(1); } init_bin_matrix_sizes(infile, M, N, Nnz); auto aPartitionPlan = get2DPartition(N, N, 0, Nparts, 1); const std::vector dims = {N, N}; auto myaPartSpec = std::span(aPartitionPlan) .subspan(rank * std::size(dims) * 2, std::size(dims) * 2); auto [aaLo, aaHi] = SplitLoHiPoint( llvm::ArrayRef(myaPartSpec.data(), std::size(myaPartSpec))); // fmt::println("Rank: {} aaHi: {} aaLo: {}", rank, aaHi, aaLo); stCooA = init_bin_matrix_a(infile, M, N, Nnz, Dh * Nh, [=](auto n1) { return (aaLo[0] <= n1 && n1 < aaHi[0]); }); #endif // DISTRIBUTED } else { stCooA = infile.ends_with(".bin") ? init_bin_matrix_a(infile, M, N, Nnz, Nh, [](auto) { return true; }) : init_matrix_a(infile, M, N, Nnz); } assert(M == N); } else { assert(0); } fmt::println(">>> Loaded SparseMat"); fflush(nullptr); system("date"); assert(N % Nparts == 0 && "Problem size (n) should divisible by nparts"); const size_t TensorSize = N * Dh * Nh, MatSize = N * N; const std::vector dims = {N, N, Nh}; const std::vector featDims = {N, Dh, Nh}; const bool AllocateWholeTensors = CheckCorrectness || (!DistOnly); if (infile.empty()) { stCooA = make_unique>(dims); init_random_sparse_2d(*stCooA, N, 1, density); } stCooQ = make_unique>(featDims); const LevelType denseLvl = *mlir::sparse_tensor::buildLevelType(LevelFormat::Dense, {}); const LevelType compressedLvl = *mlir::sparse_tensor::buildLevelType( LevelFormat::Compressed, false, false); const LevelType kCSR[] = {denseLvl, compressedLvl}; const LevelType kCSRV[] = {denseLvl, compressedLvl, denseLvl}; const LevelType kDenseV[] = {denseLvl, denseLvl, denseLvl}; const uint64_t src2tgt[] = {0, 1, 2}; auto stA = AllocateWholeTensors ? SparseTensorStorage::newFromCOO( std::size(dims), dims.data(), std::size(dims), dims.data(), kCSRV, src2tgt, src2tgt, stCooA.get()) : nullptr; getRss(); if (LocalOnly) stCooA.reset(); getRss(); fmt::println(">>> Allocated A"); fflush(nullptr); system("date"); auto stQ = AllocateWholeTensors ? SparseTensorStorage::newFromCOO( std::size(featDims), featDims.data(), std::size(featDims), featDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get()) : nullptr; if (stQ) init_feats(stQ, infile, N, Dh, Nh); if (stQ) { fmt::println(">>> Allocated Q"); fflush(nullptr); system("date"); } auto stK = AllocateWholeTensors ? SparseTensorStorage::newFromCOO( std::size(featDims), featDims.data(), std::size(featDims), featDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get()) : nullptr; if (stK) init_feats(stK, infile, N, Dh, Nh); if (stK) { fmt::println(">>> Allocated K"); fflush(nullptr); system("date"); } auto stV = AllocateWholeTensors ? SparseTensorStorage::newFromCOO( std::size(featDims), featDims.data(), std::size(featDims), featDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get()) : nullptr; if (stV) init_feats(stV, infile, N, Dh, Nh); if (stV) { fmt::println(">>> Allocated V"); fflush(nullptr); system("date"); } stCooQ.reset(); std::vector localTimes; auto start = std::chrono::high_resolution_clock::now(); auto gold_out = AllocateWholeTensors ? static_cast *>( pte_local_bspmm(stA, stV)) : nullptr; auto end = std::chrono::high_resolution_clock::now(); localTimes.push_back( std::chrono::duration_cast(end - start)); if (!CheckCorrectness && gold_out) { std::vector *o; gold_out->getValues(&o); auto outfile = infile.substr(infile.find_last_of('/') + 1); outfile = fmt::format("{}.res", outfile.substr(0, outfile.find_first_of('.'))); write_tensor(*o, outfile.c_str()); delSparseTensor((void *)gold_out); } if (AllocateWholeTensors) { fmt::println(">>> Done running local kernel"); fflush(nullptr); system("date"); getRss(); } for (auto i : llvm::seq(0u, !AllocateWholeTensors ? 0u : nTimes)) { auto start = std::chrono::high_resolution_clock::now(); auto out = static_cast *>( pte_local_bspmm(stA, stV)); auto end = std::chrono::high_resolution_clock::now(); localTimes.push_back( std::chrono::duration_cast(end - start)); delSparseTensor((void *)out); } // print average time if (AllocateWholeTensors) fmt::print("Local time: {}ms\n", float(std::accumulate(localTimes.begin(), localTimes.end(), std::chrono::milliseconds(0)) .count()) / localTimes.size()); delSparseTensor((void *)stA); delSparseTensor((void *)stQ); delSparseTensor((void *)stK); delSparseTensor((void *)stV); if (LocalOnly) { #if defined(USE_KOKKOS) lapis_finalize(); #endif #if DISTRIBUTED MPI_Finalize(); #endif return 0; } #if DISTRIBUTED auto aPartitionPlan = get2DPartition(N, N, 0, Nparts, 1); auto ptA = PartTensorStorage::newFromCOO( std::size(aPartitionPlan), aPartitionPlan.data(), std::size(dims), dims.data(), kCSR, stCooA.get()); stCooA.reset(); fmt::println(">>> {}: allocated ptA", rank); fflush(nullptr); system("date"); getRss(); auto qPartitionPlan = get2DPartition(N, Dh, Nh, Nparts, 1); auto myPartSpec = std::span(qPartitionPlan) .subspan(rank * std::size(featDims) * 2, std::size(featDims) * 2); auto fileOffset = myPartSpec[0] * Product(featDims, 1) * sizeof(float); auto partFeatDims = std::vector(3); auto [aLo, aHi] = SplitLoHiPoint(llvm::ArrayRef(myPartSpec.data(), std::size(myPartSpec))); // fmt::print("Rank: {} aHi: {} aLo: {}\n", rank, aHi, aLo); SubtractPoints(partFeatDims, aHi, aLo); // fmt::print("Rank: {} partSpec: {} fileOffset: {} partFeatDims: {}\n", rank, // myPartSpec, fileOffset, partFeatDims); auto stPartQ = SparseTensorStorage::newFromCOO( std::size(partFeatDims), partFeatDims.data(), std::size(partFeatDims), partFeatDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get()); fmt::println(">>> {}: allocated ptQ", rank); fflush(nullptr); system("date"); getRss(); auto stPartK = SparseTensorStorage::newFromCOO( std::size(partFeatDims), partFeatDims.data(), std::size(partFeatDims), partFeatDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get()); auto stPartV = SparseTensorStorage::newFromCOO( std::size(partFeatDims), partFeatDims.data(), std::size(partFeatDims), partFeatDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get()); init_feats(stPartQ, infile, partFeatDims[0], partFeatDims[1], partFeatDims[2], fileOffset); init_feats(stPartK, infile, partFeatDims[0], partFeatDims[1], partFeatDims[2], fileOffset); init_feats(stPartV, infile, partFeatDims[0], partFeatDims[1], partFeatDims[2], fileOffset); auto ptQ = PartTensorStorage::newFromSparseTensorStorage( std::size(qPartitionPlan), qPartitionPlan.data(), std::size(featDims), featDims.data(), stPartQ); auto ptK = PartTensorStorage::newFromSparseTensorStorage( std::size(qPartitionPlan), qPartitionPlan.data(), std::size(featDims), featDims.data(), stPartK); auto ptV = PartTensorStorage::newFromSparseTensorStorage( std::size(qPartitionPlan), qPartitionPlan.data(), std::size(featDims), featDims.data(), stPartV); fmt::println(">>> {}: allocated ptV", rank); fflush(nullptr); system("date"); getRss(); localTimes.clear(); start = std::chrono::high_resolution_clock::now(); auto out = static_cast *>( pte_sparse_mha(ptA, ptQ, ptK, ptV, N, N, Dh, Nh)); end = std::chrono::high_resolution_clock::now(); localTimes.push_back( std::chrono::duration_cast(end - start)); fmt::println(">>> {}: Done dist exec", rank); fflush(nullptr); system("date"); getRss(); if (!CheckCorrectness) { std::vector *o; out->getValues(&o); int comm_size; // get comm size MPI_Comm_size(MPI_COMM_WORLD, &comm_size); auto outfile = infile.substr(infile.find_last_of('/') + 1); outfile = fmt::format("{}.{}.{}.res", outfile.substr(0, outfile.find_first_of('.')), rank, comm_size); write_tensor(*o, outfile.c_str()); delSparseTensor((void *)out); } for (auto i : llvm::seq(0u, CheckCorrectness ? 0u : nTimes)) { auto start = std::chrono::high_resolution_clock::now(); auto out = static_cast *>( pte_sparse_mha(ptA, ptQ, ptK, ptV, N, N, Dh, Nh)); auto end = std::chrono::high_resolution_clock::now(); localTimes.push_back( std::chrono::duration_cast(end - start)); delSparseTensor((void *)out); } // print average time fmt::print("Dist time: {}ms\n", float(std::accumulate(localTimes.begin(), localTimes.end(), std::chrono::milliseconds(0)) .count()) / localTimes.size()); if (CheckCorrectness) { std::vector *o, *g; out->getValues(&o); gold_out->getValues(&g); size_t partSize = (N * Dh * Nh / Nparts); auto partBegin = partSize * _mlir_ciface_mpi_getRank(); auto partEnd = partBegin + partSize; for (auto i : llvm::seq(partBegin, partEnd)) { auto localI = i - partBegin; if (g->at(i) != o->at(localI)) { fmt::print("Mismatch at rank {} index: {} expected: {} got: {}\n", rank, i, g->at(i), o->at(localI)); return 1; } } } // fmt::print("Q tensor: {}\n", Q); // fmt::print("K tensor: {}\n", K); // fmt::print("V tensor: {}\n", V); #define PRINT_OUTPUT(out) \ { \ std::vector *o; \ out->getValues(&o); \ fmt::print(#out " tensor: {}\n", *o); \ write_tensor(*o, #out ".dat"); \ } // PRINT_OUTPUT(out) // PRINT_OUTPUT(gold_out) #undef PRINT_OUTPUT #if defined(USE_KOKKOS) lapis_finalize(); #endif MPI_Finalize(); #endif return 0; }