How to schedule double_buffer for local output to hide store latency

Hi all,
When I apply the double_buffer schedule to my local output buffer, it seems that none of the double_buffer scheduling takes effect any more. This is the generated code:

// Prologue: copy the first 8 elements of A and B into the first half
// (indices 0..7) of A_local and B_local.
for (int i1 = 0; i1 < 8; ++i1) {
  A_local[i1] = A[i1];
}
for (int i11 = 0; i11 < 8; ++i11) {
  B_local[i11] = B[i11];
}
// Compute the first 8 results into the first half of C_local.
for (int i12 = 0; i12 < 8; ++i12) {
  C_local[i12] = (int)B_local[i12] + (int)A_local[i12];
}
// Steady state: 7 iterations. The half of each local buffer that is used
// alternates with the parity of the iteration index.
for (int i1_outer_outer = 0; i1_outer_outer < 7; ++i1_outer_outer) {
  // Load the NEXT group of 8 inputs (source offset i1_outer_outer * 8 + 8)
  // into half (i1_outer_outer + 1) % 2 of A_local and B_local.
  for (int i13 = 0; i13 < 8; ++i13) {
    A_local[((((i1_outer_outer + 1) % 2) * 8) + i13)] = A[(((i1_outer_outer * 8) + i13) + 8)];
  }
  for (int i14 = 0; i14 < 8; ++i14) {
    B_local[((((i1_outer_outer + 1) % 2) * 8) + i14)] = B[(((i1_outer_outer * 8) + i14) + 8)];
  }
  // Compute the next group of results into the same half of C_local.
  for (int i15 = 0; i15 < 8; ++i15) {
    C_local[((((i1_outer_outer + 1) % 2) * 8) + i15)] = (int)B_local[((((i1_outer_outer + 1) % 2) * 8) + i15)] + (int)A_local[((((i1_outer_outer + 1) % 2) * 8) + i15)];
  }
  // Store the CURRENT group of results: half i1_outer_outer % 2 of C_local
  // goes to C at offset i1_outer_outer * 8. In program order this store is
  // issued after the load and compute of the next group above.
  for (int i1_inner = 0; i1_inner < 8; ++i1_inner) {
    C[((i1_outer_outer * 8) + i1_inner)] = C_local[(((i1_outer_outer % 2) * 8) + i1_inner)];
  }
}
// Epilogue: the last loop iteration (i1_outer_outer == 6) wrote its results
// to half (6 + 1) % 2 == 1, i.e. C_local[8..15]; store them to C[56..63].
for (int i1_inner1 = 0; i1_inner1 < 8; ++i1_inner1) {
  C[(i1_inner1 + 56)] = C_local[(i1_inner1 + 8)];
}

Could you give me some advice on how to implement the double_buffer schedule for this case? Or do I have to add another pass to schedule double buffering for the output data?

The double-buffer pass is probably only used on GPU for prefetching input data. Please refer to this link: