void gemm_full_block(double* restrict A,        // rdi
                     double* restrict B,        // rsi
                     double* restrict C,        // rdx
                     const uint32_t inner_k,    // ecx (never read below)
                     const uint32_t AB_stride,  // r8
                     const uint32_t C_stride,   // r9
                     uint32_t i, uint32_t j, uint32_t k) {
    __asm__(
        "subq $64, %%rsp\n\t"
        "movl 16(%%rbp), %%eax\n\t"         // eax: i (stack arg; assumes a frame pointer in rbp)
        "movl 24(%%rbp), %%ebx\n\t"         // ebx: j
        "movl 32(%%rbp), %%r10d\n\t"        // r10d: k
        "leaq (%%rdx, %%rbx, 8), %%rdx\n\t" // C = C + j*8

        // Save max_i to -8(%rsp). max_i = i + BLOCK_SIZE_M
        "movl %%eax, %%r15d\n\t"
        "addl $" STRINGIFY(BLOCK_SIZE_M) ", %%r15d\n\t"
        "movl %%r15d, -8(%%rsp)\n\t"

        // Save max_j to -16(%rsp). max_j = j + BLOCK_SIZE_N
        "movl %%ebx, %%r15d\n\t"
        "addl $" STRINGIFY(BLOCK_SIZE_N) ", %%r15d\n\t"
        "movl %%r15d, -16(%%rsp)\n\t"

        // Add k to A and B
        "leaq (%%rdi, %%r10, 8), %%rdi\n\t"
        "leaq (%%rsi, %%r10, 8), %%rsi\n\t"

        ".align 4, 0x90\n\t"
        "1:\n\t"                            // j loop: one 4-column strip of C per iteration
        "movl 16(%%rbp), %%eax\n\t"         // i = arg(i)

        ".align 4, 0x90\n\t"
        "2:\n\t"                            // i loop: one 4x4 tile of C per iteration
        "movl %%eax, %%r11d\n\t"            // r11: REG_OFFSET = i
        "imul %%r9d, %%r11d\n\t"            // REG_OFFSET = REG_OFFSET * C_stride

        // Load the 4x4 tile of C into xmm0..xmm7, two adjacent row elements per register
        "movapd 0(%%rdx, %%r11, 8), %%xmm0\n\t"  // xmm0(lo, high) <- (C0, C1)
        "movapd 16(%%rdx, %%r11, 8), %%xmm1\n\t"
        "addl %%r9d, %%r11d\n\t"
        "movapd 0(%%rdx, %%r11, 8), %%xmm2\n\t"
        "movapd 16(%%rdx, %%r11, 8), %%xmm3\n\t"
        "addl %%r9d, %%r11d\n\t"
        "movapd 0(%%rdx, %%r11, 8), %%xmm4\n\t"
        "movapd 16(%%rdx, %%r11, 8), %%xmm5\n\t"
        "addl %%r9d, %%r11d\n\t"
        "movapd 0(%%rdx, %%r11, 8), %%xmm6\n\t"
        "movapd 16(%%rdx, %%r11, 8), %%xmm7\n\t"

        "movl %%eax, %%r15d\n\t"            // r15 = i
        "imul %%r8d, %%r15d\n\t"            // r15 = i * ab_stride
        "leaq (%%rdi, %%r15, 8), %%r14\n\t" // r14 = A + (i*ab_stride)*8
        "movl %%ebx, %%r15d\n\t"            // r15 = j
        "imul %%r8d, %%r15d\n\t"            // r15 = j * ab_stride
        "leaq (%%rsi, %%r15, 8), %%r13\n\t" // r13 = B + (j*ab_stride)*8
        "xorl %%r11d, %%r11d\n\t"           // k = 0

        ".align 4, 0x90\n\t"
        "3:\n\t"                            // k loop: two elements of every dot product per iteration
        "movl %%r11d, %%r12d\n\t"           // r12 = k
        "movapd (%%r14, %%r12, 8), %%xmm8\n\t"   // A0            8:A0 9:xx 10:xx 11:xx 12:xx 13:xx 14:xx 15:xx
        "movapd (%%r13, %%r12, 8), %%xmm9\n\t"   // B0            8:A0 9:B0 10:xx 11:xx 12:xx 13:xx 14:xx 15:xx
        "movapd %%xmm9, %%xmm15\n\t"             // copy B0       8:A0 9:B0 10:xx 11:xx 12:xx 13:xx 14:xx 15:B0
        "dppd $0x31, %%xmm8, %%xmm9\n\t"         // C0 = A0 * B0  8:A0 9:C0 10:xx 11:xx 12:xx 13:xx 14:xx 15:B0
        "addl %%r8d, %%r12d\n\t"                 // r12 = k + AB_stride*1
        "movapd (%%r13, %%r12, 8), %%xmm11\n\t"  // B1            8:A0 9:C0 10:xx 11:B1 12:xx 13:xx 14:xx 15:B0
        "movapd %%xmm11, %%xmm14\n\t"            // copy B1       8:A0 9:C0 10:xx 11:B1 12:xx 13:xx 14:B1 15:B0
        "dppd $0x32, %%xmm8, %%xmm11\n\t"        // C1 = A0 * B1  8:A0 9:C0 10:xx 11:C1 12:xx 13:xx 14:B1 15:B0
        "movapd (%%r14, %%r12, 8), %%xmm10\n\t"  // A1            8:A0 9:C0 10:A1 11:C1 12:xx 13:xx 14:B1 15:B0
        "dppd $0x31, %%xmm10, %%xmm15\n\t"       // C4 = A1 * B0  8:A0 9:C0 10:A1 11:C1 12:xx 13:xx 14:B1 15:C4
        "dppd $0x32, %%xmm10, %%xmm14\n\t"       // C5 = A1 * B1  8:A0 9:C0 10:A1 11:C1 12:xx 13:xx 14:C5 15:C4
        "addl %%r8d, %%r12d\n\t"                 // r12 = k + AB_stride*2
        "movapd (%%r13, %%r12, 8), %%xmm12\n\t"  // B2            8:A0 9:C0 10:A1 11:C1 12:B2 13:xx 14:C5 15:C4
        "addpd %%xmm9, %%xmm0\n\t"               // Flush C0      8:A0 9:xx 10:A1 11:C1 12:B2 13:xx 14:C5 15:C4
        "movapd (%%r14, %%r12, 8), %%xmm13\n\t"  // A2            8:A0 9:xx 10:A1 11:C1 12:B2 13:A2 14:C5 15:C4
        "movapd %%xmm12, %%xmm9\n\t"             // copy B2       8:A0 9:B2 10:A1 11:C1 12:B2 13:A2 14:C5 15:C4
        "dppd $0x31, %%xmm8, %%xmm12\n\t"        // C2            8:A0 9:B2 10:A1 11:C1 12:C2 13:A2 14:C5 15:C4
        "addpd %%xmm11, %%xmm0\n\t"              // Flush C1      8:A0 9:B2 10:A1 11:xx 12:C2 13:A2 14:C5 15:C4
        "addpd %%xmm15, %%xmm2\n\t"              // Flush C4      8:A0 9:B2 10:A1 11:xx 12:C2 13:A2 14:C5 15:xx
        "movapd %%xmm9, %%xmm11\n\t"             // copy B2       8:A0 9:B2 10:A1 11:B2 12:C2 13:A2 14:C5 15:xx
        "dppd $0x31, %%xmm10, %%xmm9\n\t"        // C6            8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:C5 15:xx
        "addpd %%xmm14, %%xmm2\n\t"              // Flush C5      8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:xx 15:xx
        "addl %%r8d, %%r12d\n\t"                 // r12 = k + AB_stride*3
        "movapd (%%r13, %%r12, 8), %%xmm14\n\t"  // B3            8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:B3 15:xx
        "movapd (%%r14, %%r12, 8), %%xmm15\n\t"  // A3            8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:B3 15:A3
        "dppd $0x32, %%xmm14, %%xmm8\n\t"        // C3            8:C3 9:C6 10:A1 11:B2 12:C2 13:A2 14:B3 15:A3
        "dppd $0x32, %%xmm14, %%xmm10\n\t"       // C7            8:C3 9:C6 10:C7 11:B2 12:C2 13:A2 14:B3 15:A3
        "addpd %%xmm12, %%xmm1\n\t"              // Flush C2      8:C3 9:C6 10:C7 11:B2 12:xx 13:A2 14:B3 15:A3
        "movapd %%xmm11, %%xmm12\n\t"            // copy B2       8:C3 9:C6 10:C7 11:B2 12:B2 13:A2 14:B3 15:A3
        "dppd $0x31, %%xmm13, %%xmm11\n\t"       // C10           8:C3 9:C6 10:C7 11:C10 12:B2 13:A2 14:B3 15:A3
        "dppd $0x31, %%xmm15, %%xmm12\n\t"       // C14           8:C3 9:C6 10:C7 11:C10 12:C14 13:A2 14:B3 15:A3
        "addpd %%xmm9, %%xmm3\n\t"               // Flush C6      8:C3 9:xx 10:C7 11:C10 12:C14 13:A2 14:B3 15:A3
        "movapd %%xmm14, %%xmm9\n\t"             // copy B3       8:C3 9:B3 10:C7 11:C10 12:C14 13:A2 14:B3 15:A3
        "dppd $0x32, %%xmm13, %%xmm14\n\t"       // C11           8:C3 9:B3 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3
        "dppd $0x32, %%xmm15, %%xmm9\n\t"        // C15           8:C3 9:C15 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3
        "addpd %%xmm8, %%xmm1\n\t"               // Flush C3      8:xx 9:C15 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3
        "subl %%r8d, %%r12d\n\t"                 // r12 = k + AB_stride*2
        "subl %%r8d, %%r12d\n\t"                 // r12 = k + AB_stride*1
        "movapd (%%r13, %%r12, 8), %%xmm8\n\t"   // B1            8:B1 9:C15 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3
        "addpd %%xmm10, %%xmm3\n\t"              // Flush C7      8:B1 9:C15 10:xx 11:C10 12:C14 13:A2 14:C11 15:A3
        "movapd %%xmm8, %%xmm10\n\t"             // copy B1       8:B1 9:C15 10:B1 11:C10 12:C14 13:A2 14:C11 15:A3
        "dppd $0x32, %%xmm13, %%xmm8\n\t"        // C9            8:C9 9:C15 10:B1 11:C10 12:C14 13:A2 14:C11 15:A3
        "dppd $0x32, %%xmm15, %%xmm10\n\t"       // C13           8:C9 9:C15 10:C13 11:C10 12:C14 13:A2 14:C11 15:A3
        "addpd %%xmm11, %%xmm5\n\t"              // Flush C10     8:C9 9:C15 10:C13 11:xx 12:C14 13:A2 14:C11 15:A3
        "addpd %%xmm12, %%xmm7\n\t"              // Flush C14     8:C9 9:C15 10:C13 11:xx 12:xx 13:A2 14:C11 15:A3
        "subl %%r8d, %%r12d\n\t"                 // r12 = k + AB_stride*0  TODO: make mov from k
        "movapd (%%r13, %%r12, 8), %%xmm11\n\t"  // B0            8:C9 9:C15 10:C13 11:B0 12:xx 13:A2 14:C11 15:A3
        "addpd %%xmm14, %%xmm5\n\t"              // Flush C11     8:C9 9:C15 10:C13 11:B0 12:xx 13:A2 14:xx 15:A3
        "addpd %%xmm9, %%xmm7\n\t"               // Flush C15     8:C9 9:xx 10:C13 11:B0 12:xx 13:A2 14:xx 15:A3
        "movapd %%xmm11, %%xmm12\n\t"            // copy B0       8:C9 9:xx 10:C13 11:B0 12:B0 13:A2 14:xx 15:A3
        "dppd $0x31, %%xmm13, %%xmm11\n\t"       // C8            8:C9 9:xx 10:C13 11:C8 12:B0 13:A2 14:xx 15:A3
        "dppd $0x31, %%xmm15, %%xmm12\n\t"       // C12           8:C9 9:xx 10:C13 11:C8 12:C12 13:A2 14:xx 15:A3
        "addpd %%xmm8, %%xmm4\n\t"               // Flush C9      8:xx 9:xx 10:C13 11:C8 12:C12 13:A2 14:xx 15:A3
        "addpd %%xmm10, %%xmm6\n\t"              // Flush C13     8:xx 9:xx 10:xx 11:C8 12:C12 13:A2 14:xx 15:A3
        "addpd %%xmm11, %%xmm4\n\t"              // Flush C8      8:xx 9:xx 10:xx 11:xx 12:C12 13:A2 14:xx 15:A3
        "addpd %%xmm12, %%xmm6\n\t"              // Flush C12     8:xx 9:xx 10:xx 11:xx 12:xx 13:A2 14:xx 15:A3
        "addl $2, %%r11d\n\t"                    // k += 2
        "cmpl $" STRINGIFY(BLOCK_SIZE_K) ", %%r11d\n\t"
        "jne 3b\n\t"

        // Store the accumulated 4x4 tile back to C
        "movl %%eax, %%r11d\n\t"                 // r11: REG_OFFSET = i
        "imul %%r9d, %%r11d\n\t"                 // REG_OFFSET = i * C_stride
        "movapd %%xmm0, 0(%%rdx, %%r11, 8)\n\t"
        "movapd %%xmm1, 16(%%rdx, %%r11, 8)\n\t"
        "addl %%r9d, %%r11d\n\t"
        "movapd %%xmm2, 0(%%rdx, %%r11, 8)\n\t"
        "movapd %%xmm3, 16(%%rdx, %%r11, 8)\n\t"
        "addl %%r9d, %%r11d\n\t"
        "movapd %%xmm4, 0(%%rdx, %%r11, 8)\n\t"
        "movapd %%xmm5, 16(%%rdx, %%r11, 8)\n\t"
        "addl %%r9d, %%r11d\n\t"
        "movapd %%xmm6, 0(%%rdx, %%r11, 8)\n\t"
        "movapd %%xmm7, 16(%%rdx, %%r11, 8)\n\t"

        "movl -8(%%rsp), %%r15d\n\t"             // r15 = max_i
        "addl $4, %%eax\n\t"                     // i = i + 4
        "cmpl %%r15d, %%eax\n\t"
        "jb 2b\n\t"

        "addq $32, %%rdx\n\t"                    // C += 32 bytes: advance to the next 4-column strip
        "movl -16(%%rsp), %%r15d\n\t"            // r15 = max_j
        "addl $4, %%ebx\n\t"                     // j = j + 4
        "cmpl %%r15d, %%ebx\n\t"
        "jb 1b\n\t"

        "addq $64, %%rsp\n\t"
        : // Outputs
          // none
        : // Inputs
          // No inputs -- I'll gather them myself
        : // Clobbered
          // The block also overwrites every xmm register, updates C through
          // memory, and sets flags, so those must be declared as well.
          "%r15", "%r14", "%r13", "%r12", "%r11", "%r10", "%r9", "%r8",
          "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
          "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
          "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
          "cc", "memory");
}
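
/*
 * For reference, a minimal scalar sketch of what the kernel above computes,
 * under the indexing the asm actually uses: A and C are row-major, and B is
 * packed transposed, so logical column j of B lies contiguously in row j
 * (A and B rows both use AB_stride). This helper and its name
 * gemm_full_block_ref are illustrative additions, not part of the original
 * kernel; it assumes the same BLOCK_SIZE_M/N/K macros and <stdint.h> as the
 * file above. inner_k is unused, matching the asm. Note that the asm's
 * movapd accesses additionally require 16-byte-aligned addresses (aligned
 * base pointers, even strides and offsets), which this scalar version does not.
 */
static void gemm_full_block_ref(double* restrict A, double* restrict B,
                                double* restrict C, const uint32_t inner_k,
                                const uint32_t AB_stride, const uint32_t C_stride,
                                uint32_t i, uint32_t j, uint32_t k) {
    (void)inner_k; // kept only to mirror the asm kernel's signature
    for (uint32_t ii = i; ii < i + BLOCK_SIZE_M; ii++) {
        for (uint32_t jj = j; jj < j + BLOCK_SIZE_N; jj++) {
            double acc = 0.0;
            // Each C element is a dot product of a row of A with a row of
            // packed B; the asm computes the same sums two doubles at a time
            // with dppd, for a 4x4 tile of C held in xmm0..xmm7.
            for (uint32_t kk = 0; kk < BLOCK_SIZE_K; kk++)
                acc += A[ii * AB_stride + k + kk] * B[jj * AB_stride + k + kk];
            C[ii * C_stride + jj] += acc; // accumulate into C, as the asm does
        }
    }
}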