@freddiev4
Created August 16, 2024 03:06
{"problem_type": "dense_la", "language": "cpp", "name": "00_dense_la_lu_decomp", "parallelism_model": "cuda", "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {"}
{"problem_type": "dense_la", "language": "cpp", "name": "01_dense_la_solve", "parallelism_model": "cuda", "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {"}
{"problem_type": "dense_la", "language": "cpp", "name": "02_dense_la_gemm", "parallelism_model": "cuda", "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {"}
{"problem_type": "dense_la", "language": "cpp", "name": "03_dense_la_axpy", "parallelism_model": "cuda", "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {"}
{"problem_type": "dense_la", "language": "cpp", "name": "04_dense_la_gemv", "parallelism_model": "cuda", "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use CUDA to compute in parallel. The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {"}
{"problem_type": "fft", "language": "cpp", "name": "05_fft_inverse_fft", "parallelism_model": "cuda", "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(cuDoubleComplex *x, size_t N) {"}
{"problem_type": "fft", "language": "cpp", "name": "06_fft_dft", "parallelism_model": "cuda", "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {"}
{"problem_type": "fft", "language": "cpp", "name": "07_fft_fft_conjugate", "parallelism_model": "cuda", "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(cuDoubleComplex *x, size_t N) {"}
{"problem_type": "fft", "language": "cpp", "name": "08_fft_split_fft", "parallelism_model": "cuda", "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const cuDoubleComplex *x, double *r, double *i, size_t N) {"}
{"problem_type": "fft", "language": "cpp", "name": "09_fft_fft_out_of_place", "parallelism_model": "cuda", "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {"}
{"problem_type": "geometry", "language": "cpp", "name": "10_geometry_convex_hull", "parallelism_model": "cuda", "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {"}
{"problem_type": "geometry", "language": "cpp", "name": "11_geometry_convex_hull_perimeter", "parallelism_model": "cuda", "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {"}
{"problem_type": "geometry", "language": "cpp", "name": "12_geometry_smallest_triangle", "parallelism_model": "cuda", "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {"}
{"problem_type": "geometry", "language": "cpp", "name": "13_geometry_closest_pair_2d", "parallelism_model": "cuda", "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {"}
{"problem_type": "geometry", "language": "cpp", "name": "14_geometry_closest_pair_1d", "parallelism_model": "cuda", "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {"}
{"problem_type": "graph", "language": "cpp", "name": "15_graph_edge_count", "parallelism_model": "cuda", "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {"}
{"problem_type": "graph", "language": "cpp", "name": "16_graph_largest_component", "parallelism_model": "cuda", "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {"}
{"problem_type": "graph", "language": "cpp", "name": "17_graph_highest_degree", "parallelism_model": "cuda", "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph. \n Store the result in maxDegree.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {"}
{"problem_type": "graph", "language": "cpp", "name": "18_graph_count_components", "parallelism_model": "cuda", "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {"}
{"problem_type": "graph", "language": "cpp", "name": "19_graph_shortest_path", "parallelism_model": "cuda", "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {"}
{"problem_type": "histogram", "language": "cpp", "name": "20_histogram_pixel_histogram", "parallelism_model": "cuda", "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use CUDA to count in parallel. The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {"}
{"problem_type": "histogram", "language": "cpp", "name": "21_histogram_bin_0-100", "parallelism_model": "cuda", "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use CUDA to compute in parallel. The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {"}
{"problem_type": "histogram", "language": "cpp", "name": "22_histogram_count_ants", "parallelism_model": "cuda", "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use CUDA to count in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {"}
{"problem_type": "histogram", "language": "cpp", "name": "23_histogram_first_letter_counts", "parallelism_model": "cuda", "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {"}
{"problem_type": "histogram", "language": "cpp", "name": "24_histogram_count_quartile", "parallelism_model": "cuda", "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {"}
{"problem_type": "reduce", "language": "cpp", "name": "25_reduce_xor", "parallelism_model": "cuda", "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use CUDA to reduce in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {"}
{"problem_type": "reduce", "language": "cpp", "name": "26_reduce_product_of_inverses", "parallelism_model": "cuda", "prompt": "/* Compute the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use CUDA to compute product in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {"}
{"problem_type": "reduce", "language": "cpp", "name": "27_reduce_average", "parallelism_model": "cuda", "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {"}
{"problem_type": "reduce", "language": "cpp", "name": "28_reduce_smallest_odd_number", "parallelism_model": "cuda", "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use CUDA to compute in parallel. The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {"}
{"problem_type": "reduce", "language": "cpp", "name": "29_reduce_sum_of_min_of_pairs", "parallelism_model": "cuda", "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use CUDA to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {"}
{"problem_type": "scan", "language": "cpp", "name": "30_scan_prefix_sum", "parallelism_model": "cuda", "prompt": "/* Compute the prefix sum of the vector x into output.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double *x, double *output, size_t N) {"}
{"problem_type": "scan", "language": "cpp", "name": "31_scan_scan_with_min_function", "parallelism_model": "cuda", "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {"}
{"problem_type": "scan", "language": "cpp", "name": "32_scan_sum_of_prefix_sum_array", "parallelism_model": "cuda", "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {"}
{"problem_type": "scan", "language": "cpp", "name": "33_scan_reverse_prefix_sum", "parallelism_model": "cuda", "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {"}
{"problem_type": "scan", "language": "cpp", "name": "34_scan_largest_contiguous_subarray_sum", "parallelism_model": "cuda", "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {"}
{"problem_type": "search", "language": "cpp", "name": "35_search_search_for_last_struct_by_key", "parallelism_model": "cuda", "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Find the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use CUDA to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {"}
{"problem_type": "search", "language": "cpp", "name": "36_search_check_if_array_contains_value", "parallelism_model": "cuda", "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {"}
{"problem_type": "search", "language": "cpp", "name": "37_search_find_the_closest_number_to_pi", "parallelism_model": "cuda", "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {"}
{"problem_type": "search", "language": "cpp", "name": "38_search_find_the_first_even_number", "parallelism_model": "cuda", "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use CUDA to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {"}
{"problem_type": "search", "language": "cpp", "name": "39_search_xor_contains", "parallelism_model": "cuda", "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {"}
{"problem_type": "sort", "language": "cpp", "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", "parallelism_model": "cuda", "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(cuDoubleComplex *x, size_t N) {"}
{"problem_type": "sort", "language": "cpp", "name": "41_sort_k-th_smallest_element", "parallelism_model": "cuda", "prompt": "/* Find the k-th smallest element of the vector x.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {"}
{"problem_type": "sort", "language": "cpp", "name": "42_sort_sorted_ranks", "parallelism_model": "cuda", "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use CUDA to compute in parallel. The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {"}
{"problem_type": "sort", "language": "cpp", "name": "43_sort_sort_an_array_of_structs_by_key", "parallelism_model": "cuda", "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {"}
{"problem_type": "sort", "language": "cpp", "name": "44_sort_sort_non-zero_elements", "parallelism_model": "cuda", "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use CUDA to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {"}
{"problem_type": "sparse_la", "language": "cpp", "name": "45_sparse_la_sparse_solve", "parallelism_model": "cuda", "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use CUDA to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {"}
{"problem_type": "sparse_la", "language": "cpp", "name": "46_sparse_la_spmm", "parallelism_model": "cuda", "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use CUDA to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {"}
{"problem_type": "sparse_la", "language": "cpp", "name": "47_sparse_la_spmv", "parallelism_model": "cuda", "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use CUDA to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {"}
{"problem_type": "sparse_la", "language": "cpp", "name": "48_sparse_la_sparse_axpy", "parallelism_model": "cuda", "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {"}
{"problem_type": "sparse_la", "language": "cpp", "name": "49_sparse_la_sparse_lu_decomp", "parallelism_model": "cuda", "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use CUDA to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {"}
{"problem_type": "stencil", "language": "cpp", "name": "50_stencil_xor_kernel", "parallelism_model": "cuda", "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {"}
{"problem_type": "stencil", "language": "cpp", "name": "51_stencil_edge_kernel", "parallelism_model": "cuda", "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {"}
{"problem_type": "stencil", "language": "cpp", "name": "52_stencil_1d_jacobi_3-point_stencil", "parallelism_model": "cuda", "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {"}
{"problem_type": "stencil", "language": "cpp", "name": "53_stencil_2d_jacobi_5-point_stencil", "parallelism_model": "cuda", "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {"}
{"problem_type": "stencil", "language": "cpp", "name": "54_stencil_game_of_life", "parallelism_model": "cuda", "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {"}
{"problem_type": "transform", "language": "cpp", "name": "55_transform_relu", "parallelism_model": "cuda", "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {"}
{"problem_type": "transform", "language": "cpp", "name": "56_transform_negate_odds", "parallelism_model": "cuda", "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {"}
{"problem_type": "transform", "language": "cpp", "name": "57_transform_inverse_offset", "parallelism_model": "cuda", "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {"}
{"problem_type": "transform", "language": "cpp", "name": "58_transform_squaring", "parallelism_model": "cuda", "prompt": "/* Replace every element of x with the square of its value.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {"}
{"problem_type": "transform", "language": "cpp", "name": "59_transform_map_function", "parallelism_model": "cuda", "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {"}