/* This D source file is a multiple threaded implementation to perform an * extremely fast Segmented Sieve of Zakiya (SSoZ) to find Twin Primes <= N. * * Inputs are single values N, or ranges N1 and N2, of 64-bits, 0 -- 2^64 - 1. * Output is the number of twin primes <= N, or in range N1 to N2; the last * twin prime value for the range; and the total time of execution. * * Code originally developed on a System76 laptop with an Intel I7 6700HQ cpu, * 2.6-3.5 GHz clock, with 8 threads, and 16GB of memory. Parameter tuning * would be needed to optimize for other hadware systems (ARM, PowerPC, etc). * * To compile with ldc2: $ ldc2 --release -O3 -mcpu native twinprimes_ssozgist2025feb05.d * To reduce binary size do: $ strip twinprimes_ssoz * Then run as: $ ./twinprimes_ssoz * Larger or smaller value first, and underscores allowed: 123456 or 123_456. * * This D source file, and updates, will be available here: * https://gist.github.com/jzakiya/ae93bfa03dbc8b25ccc7f97ff8ad0f61 * * Mathematical and technical basis for implementation are explained here: * https://www.academia.edu/37952623/The_Use_of_Prime_Generators_to_Implement_Fast_ * Twin_Primes_Sieve_of_Zakiya_SoZ_Applications_to_Number_Theory_and_Implications_ * for_the_Riemann_Hypotheses * https://www.academia.edu/7583194/The_Segmented_Sieve_of_Zakiya_SSoZ_ * https://www.academia.edu/19786419/PRIMES-UTILS_HANDBOOK * https://www.academia.edu/81206391/Twin_Primes_Segmented_Sieve_of_Zakiya_SSoZ_Explained * * This code is provided free and subject to copyright and terms of the * GNU General Public License Version 3, GPLv3, or greater. * License copy/terms are here: http://www.gnu.org/licenses/ * * Copyright (c) 2017-2025 Jabari Zakiya -- jzakiya at gmail dot com * Version Date: 2025/02/05 */ module twinprimes_ssoz; import std; import std.datetime.stopwatch : StopWatch; import core.bitop: popcnt; import core.atomic: atomicOp; /// Compute modular inverse a^-1 of a to base b, e.g. a*(a^-1) mod b = 1. int modinv(int a0, int b0) pure @nogc { int a = a0; int b = b0; int x0 = 0; int result = 1; if (b == 1) { return result; } while (a > 1) { immutable q = a / b; a = a % b; swap(a, b); result -= q * x0; swap(x0, result); } if (result < 0) { result += b0; } return result; } // Global parameters shared uint Ks = 0; // segment size for each seg restrack shared ulong start_num; // lo number for range shared ulong end_num; // hi number for range shared ulong Kmax; // number of resgroups to end_num shared ulong Kmin; // number of resgroups to start_num shared ulong[] primes; // list of primes r1..sqrt(N) shared uint[] cnts; // holds twin primes count for each tp|thread shared ulong[] lastwins; // holds largest twin prime for each tp|thread shared uint modpg; // PG's modulus value shared uint res_0; // PG's first residue value shared ulong[] restwins; // PG's list of twinpair residues shared ulong[] resinvrs; // PG's list of residues inverses shared uint bn; // segment size factor for PG and input number /// Creates constant parameters for selected Prime Generator at runtime. void genPGparameters(int prime) { writeln("using parameters for P", prime); static immutable primes = [2, 3, 5, 7, 11, 13, 17, 19, 23]; uint modpn = 1; uint res0 = 0; // compute Pn's modulus and res_0 value foreach (prm; primes) { res0 = prm; if (prm > prime) break; modpn *= prm; } restwins = []; // save upper twin pair residues here resinvrs = new ulong[modpn + 2]; // save Pn's residues inverses here uint rc = 5; // use P3's PGS to generate residue candidates uint inc = 2; // init value in its seq: 2 4 2 4 2 4 ... auto res = 0; // init residue value while (rc < (modpn >> 1)) { // find PG's 1st half residues if (gcd(rc, modpn) == 1) { // if rc is a residue auto mc = modpn - rc; // create its modular complement resinvrs[rc] = modinv(rc,modpn); // save rc and mc inverses resinvrs[mc] = modinv(mc,modpn); // if in twinpair save both hi residues if (res + 2 == rc) { restwins ~= rc; restwins ~= mc + 2; } res = rc; // save current found residue } rc += inc; inc = inc ^ 0b110; // create next P3 sequence rc: 5 7 11 13 17 19 ... } restwins.sort; restwins ~= modpn + 1; // last residue is last hi_tp resinvrs[modpn + 1] = 1; resinvrs[modpn - 1] = modpn - 1; // last 2 residues are self inverses modpg = modpn; res_0 = res0; // save global variables } /// Select at runtime best PG and segment size factor to use for input range. /// These are good estimates derived from PG data profiling. Can be improved. void setSieveParameters(ulong start_range, ulong end_range) { ulong range = end_range - start_range; int pg = 3; if (end_range < 49) { bn = 1; pg = 3; } else if (range < 77_000_000) { bn = 16; pg = 5; } else if (range < 1_100_000_000) { bn = 32; pg = 7; } else if (range < 35_500_000_000uL) { bn = 64; pg = 11; } else if (range < 14_000_000_000_000uL) { pg = 13; if (range > 7_000_000_000_000uL) { bn = 384; } else if (range > 2_500_000_000_000uL) { bn = 320; } else if (range > 250_000_000_000uL) { bn = 196; } else { bn = 128; } } else { bn = 384; pg = 17; } genPGparameters(pg); auto pairscnt = restwins.length; // number of twin pairs|threads for selected PG Kmin = (start_num-2) / modpg + 1; // number of resgroups to start_num Kmax = (end_num - 2) / modpg + 1; // number of resgroups to end_num auto krange = Kmax - Kmin + 1; // number of range resgroups, at least 1 auto n = (krange < 37_500_000_000_000uL) ? 10 : (krange < 950_000_000_000_000uL) ? 16 : 20; auto b = bn * 1024 * n; // set seg size to optimize for selected PG Ks = cast(uint)((krange < b) ? krange : b); // segments resgroups size writefln("segment size = %,d resgroups; seg array is [1 x %,d] 64-bits", Ks, ((Ks-1) >> 6) + 1,); auto maxpairs = krange * pairscnt; // maximum number of twin prime pcs writefln("twinprime candidates = %,d; resgroups = %,d", maxpairs, krange); } /// Return the primes r0..sqrt(end_num) within range (start_num...end_num) void sozP5(ulong val) { uint md = 30; // P5's modulus value ulong rescnt = 8; // P5's residue count static immutable res = [7, 11, 13, 17, 19, 23, 29, 31]; static immutable bitn = [0,0,0,0,0,1,0,0,0,2,0,4,0,0,0,8,0,16,0,0,0,32,0,0,0,0,0,64,0,128]; ulong range_size = end_num - start_num;// integers size of inputs range ulong kmax = (val - 2) / md + 1; // number of resgroups upto input value ubyte[] prms = new ubyte[](kmax); // byte array of prime candidates init '0' ulong sqrtn = cast(ulong) sqrt(cast(double) val); // integer sqrt of val auto k = sqrtn/rescnt; auto resk = sqrtn-md*k; auto rr=0; // sqrtn resgroup|residue values, 1st res posn while (resk >= res[rr]) rr++; // find largest residue <= sqrtn posn in its resgroup auto pcs_to_sqrtn = k*rescnt + rr; // number of pcs <= sqrtn foreach (i; 0..pcs_to_sqrtn) { // for r0..sqrtN primes mark their multiples k = i/rescnt; auto r = i%rescnt; // update resgroup parameters if ((prms[k] & (1 << r)) != 0) continue; // skip pc if not prime auto prm_r = res[r]; // if prime save its residue value ulong prime = md*k + prm_r; // numerate its value auto rem = start_num % prime; // prime's modular distance to start_num if (!(prime - rem <= range_size || rem == 0)) continue; // skip prime if no multiple in range foreach (ri; res) { // mark prime's multiples in prms uint prod = prm_r * ri - 2; // compute cross-product for prm_r|ri pair ubyte bit_r = cast(ubyte) bitn[prod % md]; // bit mask value for prod's residue ulong kpm = k * (prime + ri) + prod / md; // resgroup for prime's 1st multiple while (kpm < kmax) { prms[kpm] |= bit_r; kpm += prime; } // mark prime's multiples } } // prms now contains the prime multiples positions marked for the pcs r0..N // now along each restrack, identify|numerate the primes in each resgroup // return only the primes with a multiple within range (start_num...end_num) primes = []; // create empty dynamic array for primes foreach (i, res_i; res) { // along each restrack|row til end foreach (kn, resgroup; prms) { // for each resgroup along restrack if ((resgroup & (1 << i)) == 0) { // if bit location a prime ulong prime = md * kn + res_i; // numerate its value, store if in range // check if prime has multiple in range, if so keep it, if not don't ulong rem = start_num % prime; // if rem is 0 then start_num multiple of prime if (((res_0 <= prime) && (prime <= val)) && (prime - rem <= range_size || rem == 0)) { primes ~= prime; } } } } } // primes stored in restrack|row sequence order /// Initialize 'nextp' array for given twin pair residues for thread. /// Compute 1st prime multiple resgroups in range for each prime r1..sqrt(N) /// and store consecutively as lo_tp|hi_tp pairs for their restracks. ulong[] nextp_init(ulong hi_r, ulong k_min) { ulong[] nextp = new ulong[](primes.length * 2); // 1st mults array for tp ulong r_hi = hi_r; // upper twin pair value ulong r_lo = hi_r - 2; // lower twin pair value foreach(size_t j, prime; primes) { // for each prime r1..sqrt(N) auto k = (prime - 2) / modpg; // find the resgroup it's in auto r = (prime - 2) % modpg + 2; // and its residue value auto r_inv = resinvrs[r]; // and its residue inverse auto rl = (r_lo * r_inv - 2) % modpg + 2; // compute the ri for r for lo_cp auto rh = (r_hi * r_inv - 2) % modpg + 2; // compute the ri for r for hi_tp auto kl = k * (prime + rl) + (r * rl - 2) / modpg; // kl 1st mult resgroup auto kh = k * (prime + rh) + (r * rh - 2) / modpg; // kh 1st mult resgroup if (kl < k_min) { // if 1st mult index < start_num's kl = (k_min - kl) % prime; // how many indices short is it if (kl > 0) kl = prime - kl; // adjust index value into range } else { kl = kl - k_min; } // else here, addjust index if it was > if (kh < k_min) { // if 1st mult index < start_num's kh = (k_min - kh) % prime; // how many indices short is it if (kh > 0) kh = prime - kh; // adjust index value into range } else { kh = kh - k_min; } // else here, addjust index if it was > nextp[j * 2] = kl; // lo_cp index val >= start of range nextp[j * 2 | 1] = kh; // hi_cp index val >= start of range } return nextp; } /// Perform in a thread, the ssoz for a given twinpair, for Kmax resgroups. /// First create|init 'nextp' array of 1st prime mults for given twin pair, /// (stored consequtively in 'nextp') and init seg byte array for Ks resgroups. /// For sieve, mark resgroup bits to '1' if either twinpair restrack is nonprime, /// for primes mults resgroups, and update 'nextp' restrack slices acccordingly. /// Find last twin prime|sum for range, store in their arrays for this twinpair. /// Can optionally compile to print mid twinprime values generated by twinpair. void twinsSieve(uint indx) { uint s = 6; // shift value for 64 bits uint bmask = (1 << s) - 1; // bitmask val for 64 bits auto sum = 0; // twins count for twin pair for segment ulong Ki = Kmin - 1; // first resgroup val for range for slice ulong K_max = Kmax; // last resgroup val for range for slice uint Kn = Ks; // resgroup seg size for each seg slice ulong hi_tp = 0; // value of largest tp hi prime in seg ulong r_hi = restwins[indx]; // hi tp residue for this thread ulong[] seg = new ulong[](((Ks-1) >> s) + 1); // seg array for Ks regroups if (r_hi - 2 < (start_num - 2) % modpg + 2) ++Ki; // ensure lo cp in range if (r_hi > (end_num - 2) % modpg + 2) --K_max; // ensure hi cp in range ulong[] nextp = nextp_init(r_hi, Ki); // 1st prime mults for twin pair restracks while (Ki < K_max) { // for Kn resgroup size slices upto Kmax Kn = min(Ks, K_max - Ki); // set segment slice resgroup length foreach (j, prime; primes) { // for each prime index r1..sqrt(N) // for lower pair residue track ulong k1 = nextp[j * 2]; // starting from this lower resgroup in seg while (k1 < Kn) { // mark primenth resgroup bits prime mults seg[k1 >> s] |= 1uL << (k1 & bmask); k1 += prime; } // set next prime multiple resgroup nextp[j * 2] = k1 - Kn; // save 1st resgroup in next eligible seg // for upper pair residue track ulong k2 = nextp[j * 2 | 1]; // starting from this upper resgroup in seg while (k2 < Kn) { // mark primenth resgroup prime mults seg[k2 >> s] |= 1uL << (k2 & bmask); k2 += prime; } // set next prime multiple resgroup nextp[j * 2 | 1] = k2 - Kn; // save 1st resgroup in next eligible seg } // set as nonprime unused bits in last seg[n] // so fast, do for every seg[i] seg[(Kn-1) >> s] = seg[(Kn-1) >> s] | (~1uL << ((Kn-1) & bmask)); auto cnt = 0; // count the cousinprimes in the segment foreach (i; 0..((Kn-1) >> s) + 1) { cnt += popcnt(~seg[i]); } if (cnt > 0) { // if segment has twin primes sum += cnt; // add the segment count to total count uint upk = Kn - 1; // from end of seg count backwards to largest tp while ((seg[upk >> s] & (1uL << (upk & bmask))) != 0) --upk; hi_tp = Ki + upk; // numerate its full resgroup value } Ki += Ks; // set 1st resgroup val of next seg slice if (Ki < K_max) {foreach (b; 0..((Kn-1) >> s) + 1) seg[b] = 0;} // set seg all primes } // numerate largest twin prime in seg hi_tp = ((r_hi > end_num) || (sum == 0)) ? 1 : hi_tp * modpg + r_hi; lastwins[indx] = hi_tp; // store final seg tp value cnts[indx] = sum; // sum for twin pair } /// Main routine to run twinprimes sieve, input ranges within 0..2^64 - 1. void twinPrimesSsoz(ulong[] vals) { end_num = max(vals[0], 3); start_num = (vals.length > 1) ? max(vals[1], 3) : 3; if (start_num > end_num) swap(start_num, end_num); start_num = start_num | 1; // if start_num even add 1 end_num = (end_num - 1) | 1; // if end_num even subtract 1 if (end_num - start_num < 2) { start_num = 7; end_num = 7; } writeln("threads = ", totalCPUs); auto timer1 = StopWatch(); timer1.start(); // start timing sieve setup execution setSieveParameters(start_num, end_num); auto pairscnt = restwins.length; auto krange = Kmax - Kmin + 1; // create sieve primes <= sqrt(end_num), only use those whose multiples within inputs range if (end_num < 49) primes ~= 5; // generate sieving primes else sozP5(cast(ulong) sqrt(cast(double) end_num)); // <= sqrt(end_num) writefln("each of %,d threads has nextp[2 x %,d] array", pairscnt, primes.length); ulong twinscnt = 0; // number of twin primes for range auto lo_range = restwins[0] - 3; // lo_range = lo_tp - 1 foreach (tp; [3, 5, 11, 17]) { // excluded low tp values for PGs used if (end_num == 3) break; // if 3 end of range, no twin primes if ((start_num <= tp) && (tp < lo_range)) ++twinscnt; // count small tp if any } timer1.stop(); // sieve setup time writeln("setup time = ", timer1.peek()); writeln("perform twinprimes ssoz sieve"); auto timer2 = StopWatch(); // start timing ssoz sieve execution timer2.start(); cnts = new uint[](pairscnt); // count of tps for each thread lastwins = new ulong[](pairscnt); // largest hi_tp for each thread shared uint threadscnt = 0; // count of finished threads foreach (indx; parallel(iota(0, pairscnt))) { // do in parallel for each tp twinsSieve(cast(uint) indx); // sieve selected twinpair restracks write("\r", threadscnt.atomicOp!"+="(1), " of ", pairscnt, " twinpairs done"); } write("\r", pairscnt, " of ", pairscnt, " twinpairs done"); ulong last_twin = 0uL; foreach(i; 0..pairscnt) { twinscnt += cnts[i]; if (last_twin < lastwins[i]) last_twin = lastwins[i]; } if ((end_num == 5) && (twinscnt == 1)) last_twin = 5; auto Kn = krange % Ks; // set num of resgroups in last slice if (Kn == 0) Kn = Ks; // if mult of segsize set to seg size timer2.stop(); // sieve execution time writeln("\nsieve time = ", timer2.peek()); writeln("total time = ", timer2.peek() + timer1.peek()); writefln("last segment = %,d resgroups; segment slices = %,d", Kn, (krange-1) / Ks + 1); writefln("total twins = %,d; last twin = %,d+/-1", twinscnt, last_twin - 1, ); } void main(string[] args) { // directly recieve input from terminal string[] inputs = args[1..$]; // can include '_': 123_456 auto nums = inputs.map!(i => i.filter!(n => n != '_')); auto vals = nums.map!(f => f.to!ulong()).array; twinPrimesSsoz(vals); }