46 #define X(name) NFFT(name)
49 static inline INT intprod(
const INT *vec,
const INT a,
const INT d)
54 for (t = 0; t < d; t++)
61 #define BASE(x) CEXP(x)
77 static inline void sort0(
const INT d,
const INT *n,
const INT m,
78 const INT local_x_num,
const R *local_x, INT *ar_x)
80 INT u_j[d], i, j, help, rhigh;
84 for (i = 0; i < local_x_num; i++)
88 for (j = 0; j < d; j++)
90 help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m)));
91 u_j[j] = (help % n[j] + n[j]) % n[j];
93 ar_x[2 * i] += u_j[j];
95 ar_x[2 * i] *= n[j + 1];
99 for (j = 0, nprod = 1; j < d; j++)
102 rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1;
104 ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) *
sizeof(INT));
105 Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh);
107 for (i = 1; i < local_x_num; i++)
108 assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]);
121 static inline void sort(
const X(plan) *ths)
123 if (ths->flags & NFFT_SORT_NODES)
124 sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
147 void X(trafo_direct)(
const X(plan) *ths)
149 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
151 memset(f, 0, (
size_t)(ths->M_total) *
sizeof(C));
158 #pragma omp parallel for default(shared) private(j)
160 for (j = 0; j < ths->M_total; j++)
163 for (k_L = 0; k_L < ths->N_total; k_L++)
165 R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j];
166 f[j] += f_hat[k_L] * BASE(-II * omega);
175 #pragma omp parallel for default(shared) private(j)
177 for (j = 0; j < ths->M_total; j++)
179 R x[ths->d], omega, Omega[ths->d + 1];
180 INT t, t2, k_L, k[ths->d];
182 for (t = 0; t < ths->d; t++)
185 x[t] = K2PI * ths->x[j * ths->d + t];
186 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
188 omega = Omega[ths->d];
190 for (k_L = 0; k_L < ths->N_total; k_L++)
192 f[j] += f_hat[k_L] * BASE(-II * omega);
194 for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--)
199 for (t2 = t; t2 < ths->d; t2++)
200 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
202 omega = Omega[ths->d];
209 void X(adjoint_direct)(
const X(plan) *ths)
211 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
213 memset(f_hat, 0, (
size_t)(ths->N_total) *
sizeof(C));
220 #pragma omp parallel for default(shared) private(k_L)
221 for (k_L = 0; k_L < ths->N_total; k_L++)
224 for (j = 0; j < ths->M_total; j++)
226 R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j];
227 f_hat[k_L] += f[j] * BASE(II * omega);
232 for (j = 0; j < ths->M_total; j++)
235 for (k_L = 0; k_L < ths->N_total; k_L++)
237 R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j];
238 f_hat[k_L] += f[j] * BASE(II * omega);
248 #pragma omp parallel for default(shared) private(j, k_L)
249 for (k_L = 0; k_L < ths->N_total; k_L++)
251 INT k[ths->d], k_temp, t;
255 for (t = ths->d - 1; t >= 0; t--)
257 k[t] = k_temp % ths->N[t] - ths->N[t]/2;
261 for (j = 0; j < ths->M_total; j++)
264 for (t = 0; t < ths->d; t++)
265 omega += k[t] * K2PI * ths->x[j * ths->d + t];
266 f_hat[k_L] += f[j] * BASE(II * omega);
270 for (j = 0; j < ths->M_total; j++)
272 R x[ths->d], omega, Omega[ths->d+1];
273 INT t, t2, k[ths->d];
275 for (t = 0; t < ths->d; t++)
278 x[t] = K2PI * ths->x[j * ths->d + t];
279 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
281 omega = Omega[ths->d];
282 for (k_L = 0; k_L < ths->N_total; k_L++)
284 f_hat[k_L] += f[j] * BASE(II * omega);
286 for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
291 for (t2 = t; t2 < ths->d; t2++)
292 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
294 omega = Omega[ths->d];
326 static inline void uo(
const X(plan) *ths,
const INT j, INT *up, INT *op,
329 const R xj = ths->x[j * ths->d + act_dim];
330 INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim])));
332 (*up) = c - (ths->m);
333 (*op) = c + 1 + (ths->m);
336 static inline void uo2(INT *u, INT *o,
const R x,
const INT n,
const INT m)
338 INT c = LRINT(FLOOR(x * (R)(n)));
340 *u = (c - m + n) % n;
341 *o = (c + 1 + m + n) % n;
344 #define MACRO_D_compute_A \
346 g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
349 #define MACRO_D_compute_T \
351 f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
354 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
356 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
358 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
360 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2));
362 #define MACRO_init_k_ks \
364 for (t = ths->d-1; 0 <= t; t--) \
367 ks[t] = ths->N[t]/2; \
372 #define MACRO_update_c_phi_inv_k(which_one) \
374 for (t2 = t; t2 < ths->d; t2++) \
376 c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
377 ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
378 k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
382 #define MACRO_count_k_ks \
384 for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
387 ks[t]= ths->N[t]/2; \
390 kp[t]++; k[t]++; ks[t]++; \
391 if(kp[t] == ths->N[t]/2) \
393 k[t] = ths->n[t] - ths->N[t]/2; \
399 #define MACRO_D(which_one) \
400 static inline void D_serial_ ## which_one (X(plan) *ths) \
403 R c_phi_inv_k[ths->d+1]; \
409 INT k_plain[ths->d+1]; \
410 INT ks_plain[ths->d+1]; \
412 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
413 MACRO_D_init_result_ ## which_one; \
415 c_phi_inv_k[0] = K(1.0); \
421 if (ths->flags & PRE_PHI_HUT) \
423 for (k_L = 0; k_L < ths->N_total; k_L++) \
425 MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
426 MACRO_D_compute_ ## which_one; \
432 for (k_L = 0; k_L < ths->N_total; k_L++) \
434 MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
435 MACRO_D_compute_ ## which_one; \
442 static inline void D_openmp_A(
X(plan) *ths)
447 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
448 memset(g_hat, 0, ths->n_total *
sizeof(C));
450 if (ths->flags & PRE_PHI_HUT)
452 #pragma omp parallel for default(shared) private(k_L)
453 for (k_L = 0; k_L < ths->N_total; k_L++)
458 R c_phi_inv_k_val = K(1.0);
460 INT ks_plain_val = 0;
464 for (t = ths->d-1; t >= 0; t--)
466 kp[t] = k_temp % ths->N[t];
467 if (kp[t] >= ths->N[t]/2)
468 k[t] = ths->n[t] - ths->N[t] + kp[t];
471 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
475 for (t = 0; t < ths->d; t++)
477 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
478 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
479 k_plain_val = k_plain_val*ths->n[t] + k[t];
482 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
487 #pragma omp parallel for default(shared) private(k_L)
488 for (k_L = 0; k_L < ths->N_total; k_L++)
493 R c_phi_inv_k_val = K(1.0);
495 INT ks_plain_val = 0;
499 for (t = ths->d-1; t >= 0; t--)
501 kp[t] = k_temp % ths->N[t];
502 if (kp[t] >= ths->N[t]/2)
503 k[t] = ths->n[t] - ths->N[t] + kp[t];
506 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
510 for (t = 0; t < ths->d; t++)
512 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
513 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
514 k_plain_val = k_plain_val*ths->n[t] + k[t];
517 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
527 static inline void D_A(
X(plan) *ths)
537 static void D_openmp_T(
X(plan) *ths)
542 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
543 memset(f_hat, 0, ths->N_total *
sizeof(C));
545 if (ths->flags & PRE_PHI_HUT)
547 #pragma omp parallel for default(shared) private(k_L)
548 for (k_L = 0; k_L < ths->N_total; k_L++)
553 R c_phi_inv_k_val = K(1.0);
555 INT ks_plain_val = 0;
559 for (t = ths->d - 1; t >= 0; t--)
561 kp[t] = k_temp % ths->N[t];
562 if (kp[t] >= ths->N[t]/2)
563 k[t] = ths->n[t] - ths->N[t] + kp[t];
566 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
570 for (t = 0; t < ths->d; t++)
572 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
573 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
574 k_plain_val = k_plain_val*ths->n[t] + k[t];
577 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
582 #pragma omp parallel for default(shared) private(k_L)
583 for (k_L = 0; k_L < ths->N_total; k_L++)
588 R c_phi_inv_k_val = K(1.0);
590 INT ks_plain_val = 0;
594 for (t = ths->d-1; t >= 0; t--)
596 kp[t] = k_temp % ths->N[t];
597 if (kp[t] >= ths->N[t]/2)
598 k[t] = ths->n[t] - ths->N[t] + kp[t];
601 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
605 for (t = 0; t < ths->d; t++)
607 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
608 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
609 k_plain_val = k_plain_val*ths->n[t] + k[t];
612 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
622 static void D_T(
X(plan) *ths)
632 #define MACRO_B_init_result_A memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
633 #define MACRO_B_init_result_T memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
635 #define MACRO_B_PRE_FULL_PSI_compute_A \
637 (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
640 #define MACRO_B_PRE_FULL_PSI_compute_T \
642 g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
645 #define MACRO_B_compute_A \
647 (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
650 #define MACRO_B_compute_T \
652 g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
655 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
657 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
659 #define MACRO_without_PRE_PSI PHI(ths->n[t2], ths->x[j*ths->d+t2] \
660 - ((R)l[t2])/((R)ths->n[t2]), t2)
662 #define MACRO_init_uo_l_lj_t \
664 for (t = ths->d-1; t >= 0; t--) \
666 uo(ths,j,&u[t],&o[t],t); \
673 #define MACRO_update_phi_prod_ll_plain(which_one) { \
674 for (t2 = t; t2 < ths->d; t2++) \
676 phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \
677 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + (l[t2] + ths->n[t2]) % ths->n[t2]; \
681 #define MACRO_count_uo_l_lj_t \
683 for (t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \
693 #define MACRO_B(which_one) \
694 static inline void B_serial_ ## which_one (X(plan) *ths) \
697 INT u[ths->d], o[ths->d]; \
703 INT ll_plain[ths->d+1]; \
704 R phi_prod[ths->d+1]; \
708 R fg_psi[ths->d][2*ths->m+2]; \
709 R fg_exp_l[ths->d][2*ths->m+2]; \
711 R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
714 INT ip_s = ths->K/(ths->m+2); \
716 f = (C*)ths->f; g = (C*)ths->g; \
718 MACRO_B_init_result_ ## which_one; \
720 if (ths->flags & PRE_FULL_PSI) \
722 for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
724 for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
726 MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \
732 phi_prod[0] = K(1.0); \
735 for (t = 0, lprod = 1; t < ths->d; t++) \
736 lprod *= (2 * ths->m + 2); \
738 if (ths->flags & PRE_PSI) \
740 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
742 MACRO_init_uo_l_lj_t; \
744 for (l_L = 0; l_L < lprod; l_L++) \
746 MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \
748 MACRO_B_compute_ ## which_one; \
750 MACRO_count_uo_l_lj_t; \
756 if (ths->flags & PRE_FG_PSI) \
758 for(t2 = 0; t2 < ths->d; t2++) \
760 tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \
761 tmpEXP2sq = tmpEXP2*tmpEXP2; \
764 fg_exp_l[t2][0] = K(1.0); \
765 for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \
767 tmp3 = tmp2*tmpEXP2; \
769 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \
772 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
774 MACRO_init_uo_l_lj_t; \
776 for (t2 = 0; t2 < ths->d; t2++) \
778 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
779 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
781 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
784 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
788 for (l_L= 0; l_L < lprod; l_L++) \
790 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
792 MACRO_B_compute_ ## which_one; \
794 MACRO_count_uo_l_lj_t; \
800 if (ths->flags & FG_PSI) \
802 for (t2 = 0; t2 < ths->d; t2++) \
804 tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
805 tmpEXP2sq = tmpEXP2*tmpEXP2; \
808 fg_exp_l[t2][0] = K(1.0); \
809 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
811 tmp3 = tmp2*tmpEXP2; \
813 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
816 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
818 MACRO_init_uo_l_lj_t; \
820 for (t2 = 0; t2 < ths->d; t2++) \
822 fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\
824 tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
827 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
830 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
834 for (l_L = 0; l_L < lprod; l_L++) \
836 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
838 MACRO_B_compute_ ## which_one; \
840 MACRO_count_uo_l_lj_t; \
846 if (ths->flags & PRE_LIN_PSI) \
848 for (j = 0, fj=f; j<ths->M_total; j++, fj++) \
850 MACRO_init_uo_l_lj_t; \
852 for (t2 = 0; t2 < ths->d; t2++) \
854 y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
855 * ((R)(ths->K))) / (R)(ths->m + 2); \
856 ip_u = LRINT(FLOOR(y[t2])); \
858 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
860 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
861 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
866 for (l_L = 0; l_L < lprod; l_L++) \
868 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
870 MACRO_B_compute_ ## which_one; \
872 MACRO_count_uo_l_lj_t; \
879 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
881 MACRO_init_uo_l_lj_t; \
883 for (l_L = 0; l_L < lprod; l_L++) \
885 MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \
887 MACRO_B_compute_ ## which_one; \
889 MACRO_count_uo_l_lj_t; \
899 static inline void B_openmp_A (
X(plan) *ths)
904 memset(ths->f, 0, ths->M_total *
sizeof(C));
906 for (k = 0, lprod = 1; k < ths->d; k++)
907 lprod *= (2*ths->m+2);
909 if (ths->flags & PRE_FULL_PSI)
911 #pragma omp parallel for default(shared) private(k)
912 for (k = 0; k < ths->M_total; k++)
915 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
917 for (l = 0; l < lprod; l++)
918 ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
923 if (ths->flags & PRE_PSI)
925 #pragma omp parallel for default(shared) private(k)
926 for (k = 0; k < ths->M_total; k++)
928 INT u[ths->d], o[ths->d];
933 INT ll_plain[ths->d+1];
934 R phi_prod[ths->d+1];
935 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
937 phi_prod[0] = K(1.0);
940 MACRO_init_uo_l_lj_t;
942 for (l_L = 0; l_L < lprod; l_L++)
944 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
946 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
948 MACRO_count_uo_l_lj_t;
954 if (ths->flags & PRE_FG_PSI)
957 R fg_exp_l[ths->d][2*ths->m+2];
959 for (t2 = 0; t2 < ths->d; t2++)
962 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
963 R tmpEXP2sq = tmpEXP2*tmpEXP2;
966 fg_exp_l[t2][0] = K(1.0);
967 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
971 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
975 #pragma omp parallel for default(shared) private(k,t,t2)
976 for (k = 0; k < ths->M_total; k++)
978 INT ll_plain[ths->d+1];
979 R phi_prod[ths->d+1];
980 INT u[ths->d], o[ths->d];
983 R fg_psi[ths->d][2*ths->m+2];
987 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
989 phi_prod[0] = K(1.0);
992 MACRO_init_uo_l_lj_t;
994 for (t2 = 0; t2 < ths->d; t2++)
996 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
997 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
999 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1002 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1006 for (l_L= 0; l_L < lprod; l_L++)
1008 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1010 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1012 MACRO_count_uo_l_lj_t;
1018 if (ths->flags & FG_PSI)
1021 R fg_exp_l[ths->d][2*ths->m+2];
1025 for (t2 = 0; t2 < ths->d; t2++)
1028 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1029 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1032 fg_exp_l[t2][0] = K(1.0);
1033 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1035 tmp3 = tmp2*tmpEXP2;
1037 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1041 #pragma omp parallel for default(shared) private(k,t,t2)
1042 for (k = 0; k < ths->M_total; k++)
1044 INT ll_plain[ths->d+1];
1045 R phi_prod[ths->d+1];
1046 INT u[ths->d], o[ths->d];
1049 R fg_psi[ths->d][2*ths->m+2];
1053 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1055 phi_prod[0] = K(1.0);
1058 MACRO_init_uo_l_lj_t;
1060 for (t2 = 0; t2 < ths->d; t2++)
1062 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1064 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1067 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1070 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1074 for (l_L = 0; l_L < lprod; l_L++)
1076 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1078 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1080 MACRO_count_uo_l_lj_t;
1086 if (ths->flags & PRE_LIN_PSI)
1090 #pragma omp parallel for default(shared) private(k)
1091 for (k = 0; k<ths->M_total; k++)
1093 INT u[ths->d], o[ths->d];
1098 INT ll_plain[ths->d+1];
1099 R phi_prod[ths->d+1];
1101 R fg_psi[ths->d][2*ths->m+2];
1105 INT ip_s = ths->K/(ths->m+2);
1106 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1108 phi_prod[0] = K(1.0);
1111 MACRO_init_uo_l_lj_t;
1113 for (t2 = 0; t2 < ths->d; t2++)
1115 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1116 * ((R)ths->K))/(ths->m+2);
1117 ip_u = LRINT(FLOOR(y[t2]));
1119 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1121 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1122 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1127 for (l_L = 0; l_L < lprod; l_L++)
1129 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1131 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1133 MACRO_count_uo_l_lj_t;
1142 #pragma omp parallel for default(shared) private(k)
1143 for (k = 0; k < ths->M_total; k++)
1145 INT u[ths->d], o[ths->d];
1150 INT ll_plain[ths->d+1];
1151 R phi_prod[ths->d+1];
1152 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1154 phi_prod[0] = K(1.0);
1157 MACRO_init_uo_l_lj_t;
1159 for (l_L = 0; l_L < lprod; l_L++)
1161 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1163 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1165 MACRO_count_uo_l_lj_t;
1171 static void B_A(
X(plan) *ths)
1196 static inline INT index_x_binary_search(
const INT *ar_x,
const INT len,
const INT key)
1198 INT left = 0, right = len - 1;
1203 while (left < right - 1)
1205 INT i = (left + right) / 2;
1206 if (ar_x[2*i] >= key)
1208 else if (ar_x[2*i] < key)
1212 if (ar_x[2*left] < key && left != len-1)
1235 static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0,
1236 INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b,
const INT d,
1237 const INT *n,
const INT m)
1239 const INT n0 = n[0];
1241 INT nthreads = omp_get_num_threads();
1242 INT nthreads_used = MIN(nthreads, n0);
1243 INT size_per_thread = n0 / nthreads_used;
1244 INT size_left = n0 - size_per_thread * nthreads_used;
1245 INT size_g[nthreads_used];
1246 INT offset_g[nthreads_used];
1247 INT my_id = omp_get_thread_num();
1248 INT n_prod_rest = 1;
1250 for (k = 1; k < d; k++)
1251 n_prod_rest *= n[k];
1260 if (my_id < nthreads_used)
1262 const INT m22 = 2 * m + 2;
1265 for (k = 0; k < nthreads_used; k++)
1268 offset_g[k] = offset_g[k-1] + size_g[k-1];
1269 size_g[k] = size_per_thread;
1277 *my_u0 = offset_g[my_id];
1278 *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1280 if (nthreads_used > 1)
1282 *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1283 *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1288 *max_u_a = n_prod_rest * n0 - 1;
1293 *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1294 *max_u_b = n_prod_rest * n0 - 1;
1298 if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1300 *max_u_a = *max_u_b;
1305 assert(*min_u_a <= *max_u_a);
1306 assert(*min_u_b <= *max_u_b);
1307 assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1321 static void nfft_adjoint_B_compute_full_psi(C *g,
const INT *psi_index_g,
1322 const R *psi,
const C *f,
const INT M,
const INT d,
const INT *n,
1323 const INT m,
const unsigned flags,
const INT *index_x)
1335 for(t = 0, lprod = 1; t < d; t++)
1339 lprod_m1 = lprod / (2 * m + 2);
1343 if (flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1345 #pragma omp parallel private(k)
1347 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1348 const INT *ar_x = index_x;
1349 INT n_prod_rest = 1;
1351 for (k = 1; k < d; k++)
1352 n_prod_rest *= n[k];
1354 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1358 k = index_x_binary_search(ar_x, M, min_u_a);
1360 assert(ar_x[2*k] >= min_u_a || k == M-1);
1362 assert(ar_x[2*k-2] < min_u_a);
1367 INT u_prod = ar_x[2*k];
1368 INT j = ar_x[2*k+1];
1370 if (u_prod < min_u_a || u_prod > max_u_a)
1373 for (l0 = 0; l0 < 2 * m + 2; l0++)
1375 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1377 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1380 for (lrest = 0; lrest < lprod_m1; lrest++)
1382 const INT l = l0 * lprod_m1 + lrest;
1383 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1393 k = index_x_binary_search(ar_x, M, min_u_b);
1395 assert(ar_x[2*k] >= min_u_b || k == M-1);
1397 assert(ar_x[2*k-2] < min_u_b);
1402 INT u_prod = ar_x[2*k];
1403 INT j = ar_x[2*k+1];
1405 if (u_prod < min_u_b || u_prod > max_u_b)
1408 for (l0 = 0; l0 < 2 * m + 2; l0++)
1410 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1412 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1414 for (lrest = 0; lrest < lprod_m1; lrest++)
1416 const INT l = l0 * lprod_m1 + lrest;
1417 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1430 #pragma omp parallel for default(shared) private(k)
1432 for (k = 0; k < M; k++)
1435 INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1437 for (l = 0; l < lprod; l++)
1440 C val = psi[j * lprod + l] * f[j];
1441 C *gref = g + psi_index_g[j * lprod + l];
1442 R *gref_real = (R*) gref;
1445 gref_real[0] += creal(val);
1448 gref_real[1] += cimag(val);
1450 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1461 static inline void B_openmp_T(
X(plan) *ths)
1466 memset(ths->g, 0, (
size_t)(ths->n_total) *
sizeof(C));
1468 for (k = 0, lprod = 1; k < ths->d; k++)
1469 lprod *= (2*ths->m+2);
1471 if (ths->flags & PRE_FULL_PSI)
1473 nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f,
1474 ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x);
1478 if (ths->flags & PRE_PSI)
1480 #pragma omp parallel for default(shared) private(k)
1481 for (k = 0; k < ths->M_total; k++)
1483 INT u[ths->d], o[ths->d];
1488 INT ll_plain[ths->d+1];
1489 R phi_prod[ths->d+1];
1490 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1492 phi_prod[0] = K(1.0);
1495 MACRO_init_uo_l_lj_t;
1497 for (l_L = 0; l_L < lprod; l_L++)
1503 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1505 lhs = ths->g + ll_plain[ths->d];
1507 val = phi_prod[ths->d] * ths->f[j];
1510 lhs_real[0] += CREAL(val);
1513 lhs_real[1] += CIMAG(val);
1515 MACRO_count_uo_l_lj_t;
1521 if (ths->flags & PRE_FG_PSI)
1524 R fg_exp_l[ths->d][2*ths->m+2];
1525 for(t2 = 0; t2 < ths->d; t2++)
1528 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1529 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1532 fg_exp_l[t2][0] = K(1.0);
1533 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1535 tmp3 = tmp2*tmpEXP2;
1537 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1541 #pragma omp parallel for default(shared) private(k,t,t2)
1542 for (k = 0; k < ths->M_total; k++)
1544 INT ll_plain[ths->d+1];
1545 R phi_prod[ths->d+1];
1546 INT u[ths->d], o[ths->d];
1549 R fg_psi[ths->d][2*ths->m+2];
1553 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1555 phi_prod[0] = K(1.0);
1558 MACRO_init_uo_l_lj_t;
1560 for (t2 = 0; t2 < ths->d; t2++)
1562 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
1563 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
1565 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1568 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1572 for (l_L= 0; l_L < lprod; l_L++)
1578 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1580 lhs = ths->g + ll_plain[ths->d];
1582 val = phi_prod[ths->d] * ths->f[j];
1585 lhs_real[0] += CREAL(val);
1588 lhs_real[1] += CIMAG(val);
1590 MACRO_count_uo_l_lj_t;
1596 if (ths->flags & FG_PSI)
1599 R fg_exp_l[ths->d][2*ths->m+2];
1603 for (t2 = 0; t2 < ths->d; t2++)
1606 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1607 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1610 fg_exp_l[t2][0] = K(1.0);
1611 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1613 tmp3 = tmp2*tmpEXP2;
1615 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1619 #pragma omp parallel for default(shared) private(k,t,t2)
1620 for (k = 0; k < ths->M_total; k++)
1622 INT ll_plain[ths->d+1];
1623 R phi_prod[ths->d+1];
1624 INT u[ths->d], o[ths->d];
1627 R fg_psi[ths->d][2*ths->m+2];
1631 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1633 phi_prod[0] = K(1.0);
1636 MACRO_init_uo_l_lj_t;
1638 for (t2 = 0; t2 < ths->d; t2++)
1640 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1642 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1645 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1648 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1652 for (l_L = 0; l_L < lprod; l_L++)
1658 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1660 lhs = ths->g + ll_plain[ths->d];
1662 val = phi_prod[ths->d] * ths->f[j];
1665 lhs_real[0] += CREAL(val);
1668 lhs_real[1] += CIMAG(val);
1670 MACRO_count_uo_l_lj_t;
1676 if (ths->flags & PRE_LIN_PSI)
1680 #pragma omp parallel for default(shared) private(k)
1681 for (k = 0; k<ths->M_total; k++)
1683 INT u[ths->d], o[ths->d];
1688 INT ll_plain[ths->d+1];
1689 R phi_prod[ths->d+1];
1691 R fg_psi[ths->d][2*ths->m+2];
1695 INT ip_s = ths->K/(ths->m+2);
1696 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1698 phi_prod[0] = K(1.0);
1701 MACRO_init_uo_l_lj_t;
1703 for (t2 = 0; t2 < ths->d; t2++)
1705 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1706 * ((R)ths->K))/(ths->m+2);
1707 ip_u = LRINT(FLOOR(y[t2]));
1709 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1711 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1712 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1717 for (l_L = 0; l_L < lprod; l_L++)
1723 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1725 lhs = ths->g + ll_plain[ths->d];
1727 val = phi_prod[ths->d] * ths->f[j];
1730 lhs_real[0] += CREAL(val);
1733 lhs_real[1] += CIMAG(val);
1735 MACRO_count_uo_l_lj_t;
1744 #pragma omp parallel for default(shared) private(k)
1745 for (k = 0; k < ths->M_total; k++)
1747 INT u[ths->d], o[ths->d];
1752 INT ll_plain[ths->d+1];
1753 R phi_prod[ths->d+1];
1754 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1756 phi_prod[0] = K(1.0);
1759 MACRO_init_uo_l_lj_t;
1761 for (l_L = 0; l_L < lprod; l_L++)
1767 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1769 lhs = ths->g + ll_plain[ths->d];
1771 val = phi_prod[ths->d] * ths->f[j];
1774 lhs_real[0] += CREAL(val);
1777 lhs_real[1] += CIMAG(val);
1779 MACRO_count_uo_l_lj_t;
1785 static void B_T(
X(plan) *ths)
1796 static void nfft_1d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
1798 const INT tmp2 = 2*m+2;
1800 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
1802 fg_exp_b0 = EXP(K(-1.0)/b);
1803 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
1804 fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
1806 for (l = 1; l < tmp2; l++)
1808 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
1809 fg_exp_b1 *= fg_exp_b0_sq;
1810 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
1815 static void nfft_trafo_1d_compute(C *fj,
const C *g,
const R *psij_const,
1816 const R *xj,
const INT n,
const INT m)
1823 uo2(&u, &o, *xj, n, m);
1827 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
1828 (*fj) += (*psij++) * (*gj++);
1832 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
1833 (*fj) += (*psij++) * (*gj++);
1834 for (l = 0, gj = g; l <= o; l++)
1835 (*fj) += (*psij++) * (*gj++);
1840 static void nfft_adjoint_1d_compute_serial(
const C *fj, C *g,
1841 const R *psij_const,
const R *xj,
const INT n,
const INT m)
1848 uo2(&u,&o,*xj, n, m);
1852 for (l = 0, gj = g+u; l <= 2*m+1; l++)
1853 (*gj++) += (*psij++) * (*fj);
1857 for (l = 0, gj = g+u; l < 2*m+1-o; l++)
1858 (*gj++) += (*psij++) * (*fj);
1859 for (l = 0, gj = g; l <= o; l++)
1860 (*gj++) += (*psij++) * (*fj);
1867 static void nfft_adjoint_1d_compute_omp_atomic(
const C f, C *g,
1868 const R *psij_const,
const R *xj,
const INT n,
const INT m)
1872 INT index_temp[2*m+2];
1874 uo2(&u,&o,*xj, n, m);
1876 for (l=0; l<=2*m+1; l++)
1877 index_temp[l] = (l+u)%n;
1879 for (l = 0, gj = g+u; l <= 2*m+1; l++)
1881 INT i = index_temp[l];
1883 R *lhs_real = (R*)lhs;
1884 C val = psij_const[l] * f;
1886 lhs_real[0] += creal(val);
1889 lhs_real[1] += cimag(val);
1910 static void nfft_adjoint_1d_compute_omp_blockwise(
const C f, C *g,
1911 const R *psij_const,
const R *xj,
const INT n,
const INT m,
1912 const INT my_u0,
const INT my_o0)
1916 uo2(&ar_u,&ar_o,*xj, n, m);
1920 INT u = MAX(my_u0,ar_u);
1921 INT o = MIN(my_o0,ar_o);
1922 INT offset_psij = u-ar_u;
1924 assert(offset_psij >= 0);
1925 assert(o-u <= 2*m+1);
1926 assert(offset_psij+o-u <= 2*m+1);
1929 for (l = 0; l <= o-u; l++)
1930 g[u+l] += psij_const[offset_psij+l] * f;
1934 INT u = MAX(my_u0,ar_u);
1936 INT offset_psij = u-ar_u;
1938 assert(offset_psij >= 0);
1939 assert(o-u <= 2*m+1);
1940 assert(offset_psij+o-u <= 2*m+1);
1943 for (l = 0; l <= o-u; l++)
1944 g[u+l] += psij_const[offset_psij+l] * f;
1947 o = MIN(my_o0,ar_o);
1948 offset_psij += my_u0-ar_u+n;
1953 assert(o-u <= 2*m+1);
1954 if (offset_psij+o-u > 2*m+1)
1956 fprintf(stderr,
"ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
1958 assert(offset_psij+o-u <= 2*m+1);
1961 for (l = 0; l <= o-u; l++)
1962 g[u+l] += psij_const[offset_psij+l] * f;
1967 static void nfft_trafo_1d_B(
X(plan) *ths)
1969 const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
1970 const C *g = (C*)ths->g;
1972 if (ths->flags & PRE_FULL_PSI)
1976 #pragma omp parallel for default(shared) private(k)
1978 for (k = 0; k < M; k++)
1981 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1983 for (l = 0; l < m2p2; l++)
1984 ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
1989 if (ths->flags & PRE_PSI)
1993 #pragma omp parallel for default(shared) private(k)
1995 for (k = 0; k < M; k++)
1997 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1998 nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
2004 if (ths->flags & PRE_FG_PSI)
2009 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2012 #pragma omp parallel for default(shared) private(k)
2014 for (k = 0; k < M; k++)
2016 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2017 const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
2018 R fg_psij2 = K(1.0);
2022 psij_const[0] = fg_psij0;
2024 for (l = 1; l < m2p2; l++)
2026 fg_psij2 *= fg_psij1;
2027 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2030 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2036 if (ths->flags & FG_PSI)
2043 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2046 #pragma omp parallel for default(shared) private(k)
2048 for (k = 0; k < M; k++)
2050 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2052 R fg_psij0, fg_psij1, fg_psij2;
2055 uo(ths, (INT)j, &u, &o, (INT)0);
2056 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0));
2057 fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]);
2060 psij_const[0] = fg_psij0;
2062 for (l = 1; l < m2p2; l++)
2064 fg_psij2 *= fg_psij1;
2065 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2068 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2073 if (ths->flags & PRE_LIN_PSI)
2075 const INT K = ths->K, ip_s = K / (m + 2);
2081 #pragma omp parallel for default(shared) private(k)
2083 for (k = 0; k < M; k++)
2089 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2091 uo(ths, (INT)j, &u, &o, (INT)0);
2093 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2094 ip_u = (INT)(LRINT(FLOOR(ip_y)));
2095 ip_w = ip_y - (R)(ip_u);
2097 for (l = 0; l < m2p2; l++)
2098 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2099 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2101 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2113 #pragma omp parallel for default(shared) private(k)
2115 for (k = 0; k < M; k++)
2119 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2121 uo(ths, (INT)j, &u, &o, (INT)0);
2123 for (l = 0; l < m2p2; l++)
2124 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0));
2126 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2133 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2135 assert(ar_x[2*k] >= min_u_a || k == M-1); \
2137 assert(ar_x[2*k-2] < min_u_a); \
2140 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
2144 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2146 assert(ar_x[2*k] >= min_u_b || k == M-1); \
2148 assert(ar_x[2*k-2] < min_u_b); \
2151 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
2154 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
2156 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
2157 ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
2160 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
2162 R psij_const[2 * m + 2]; \
2164 R fg_psij0 = ths->psi[2 * j]; \
2165 R fg_psij1 = ths->psi[2 * j + 1]; \
2166 R fg_psij2 = K(1.0); \
2168 psij_const[0] = fg_psij0; \
2169 for (l = 1; l <= 2 * m + 1; l++) \
2171 fg_psij2 *= fg_psij1; \
2172 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2175 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2176 ths->x + j, n, m, my_u0, my_o0); \
2179 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
2181 R psij_const[2 * m + 2]; \
2182 R fg_psij0, fg_psij1, fg_psij2; \
2185 uo(ths, j, &u, &o, (INT)0); \
2186 fg_psij0 = (PHI(ths->n[0],ths->x[j]-((R)u)/n,0)); \
2187 fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \
2188 fg_psij2 = K(1.0); \
2189 psij_const[0] = fg_psij0; \
2190 for (l = 1; l <= 2 * m + 1; l++) \
2192 fg_psij2 *= fg_psij1; \
2193 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2196 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2197 ths->x + j, n, m, my_u0, my_o0); \
2200 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
2202 R psij_const[2 * m + 2]; \
2207 uo(ths, j, &u, &o, (INT)0); \
2209 ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \
2210 ip_u = LRINT(FLOOR(ip_y)); \
2211 ip_w = ip_y - ip_u; \
2212 for (l = 0; l < 2 * m + 2; l++) \
2214 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
2215 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
2217 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2218 ths->x + j, n, m, my_u0, my_o0); \
2221 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
2223 R psij_const[2 * m + 2]; \
2226 uo(ths, j, &u, &o, (INT)0); \
2228 for (l = 0; l <= 2 * m + 1; l++) \
2229 psij_const[l] = (PHI(ths->n[0],ths->x[j]-((R)((u+l)))/n,0)); \
2231 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2232 ths->x + j, n, m, my_u0, my_o0); \
2235 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
2237 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
2239 _Pragma("omp parallel private(k)") \
2241 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
2242 INT *ar_x = ths->index_x; \
2244 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
2245 &min_u_b, &max_u_b, 1, &n, m); \
2247 if (min_u_a != -1) \
2249 k = index_x_binary_search(ar_x, M, min_u_a); \
2251 MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2255 INT u_prod = ar_x[2*k]; \
2256 INT j = ar_x[2*k+1]; \
2258 if (u_prod < min_u_a || u_prod > max_u_a) \
2261 MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2267 if (min_u_b != -1) \
2269 k = index_x_binary_search(ar_x, M, min_u_b); \
2271 MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2275 INT u_prod = ar_x[2*k]; \
2276 INT j = ar_x[2*k+1]; \
2278 if (u_prod < min_u_b || u_prod > max_u_b) \
2281 MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2291 static void nfft_adjoint_1d_B(
X(plan) *ths)
2293 const INT n = ths->n[0], M = ths->M_total, m = ths->m;
2297 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
2299 if (ths->flags & PRE_FULL_PSI)
2301 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
2302 (INT)1, ths->n, m, ths->flags, ths->index_x);
2306 if (ths->flags & PRE_PSI)
2309 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
2313 #pragma omp parallel for default(shared) private(k)
2315 for (k = 0; k < M; k++)
2317 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2319 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2321 nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2328 if (ths->flags & PRE_FG_PSI)
2330 R fg_exp_l[2 * m + 2];
2332 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2335 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
2340 #pragma omp parallel for default(shared) private(k)
2342 for (k = 0; k < M; k++)
2344 R psij_const[2 * m + 2];
2345 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2347 R fg_psij0 = ths->psi[2 * j];
2348 R fg_psij1 = ths->psi[2 * j + 1];
2349 R fg_psij2 = K(1.0);
2351 psij_const[0] = fg_psij0;
2352 for (l = 1; l <= 2 * m + 1; l++)
2354 fg_psij2 *= fg_psij1;
2355 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2359 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2361 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2368 if (ths->flags & FG_PSI)
2370 R fg_exp_l[2 * m + 2];
2372 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2377 MACRO_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
2381 #pragma omp parallel for default(shared) private(k)
2383 for (k = 0; k < M; k++)
2386 R psij_const[2 * m + 2];
2387 R fg_psij0, fg_psij1, fg_psij2;
2388 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2390 uo(ths, j, &u, &o, (INT)0);
2391 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0));
2392 fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]);
2394 psij_const[0] = fg_psij0;
2395 for (l = 1; l <= 2 * m + 1; l++)
2397 fg_psij2 *= fg_psij1;
2398 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2402 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2404 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2411 if (ths->flags & PRE_LIN_PSI)
2413 const INT K = ths->K;
2414 const INT ip_s = K / (m + 2);
2419 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
2423 #pragma omp parallel for default(shared) private(k)
2425 for (k = 0; k < M; k++)
2430 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2431 R psij_const[2 * m + 2];
2433 uo(ths, j, &u, &o, (INT)0);
2435 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2436 ip_u = (INT)(LRINT(FLOOR(ip_y)));
2437 ip_w = ip_y - (R)(ip_u);
2438 for (l = 0; l < 2 * m + 2; l++)
2440 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2441 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2444 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2446 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2456 MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2460 #pragma omp parallel for default(shared) private(k)
2462 for (k = 0; k < M; k++)
2465 R psij_const[2 * m + 2];
2466 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2468 uo(ths, j, &u, &o, (INT)0);
2470 for (l = 0; l <= 2 * m + 1; l++)
2471 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0));
2474 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2476 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2481 void X(trafo_1d)(
X(plan) *ths)
2483 const INT N = ths->N[0], N2 = N/2, n = ths->n[0];
2484 C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
2486 ths->g_hat = ths->g1;
2490 C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
2491 R *c_phi_inv1, *c_phi_inv2;
2497 #pragma omp parallel for default(shared) private(k)
2498 for (k = 0; k < ths->n_total; k++)
2499 ths->g_hat[k] = 0.0;
2502 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
2504 if(ths->flags & PRE_PHI_HUT)
2507 c_phi_inv1 = ths->c_phi_inv[0];
2508 c_phi_inv2 = &ths->c_phi_inv[0][N2];
2511 #pragma omp parallel for default(shared) private(k)
2513 for (k = 0; k < N2; k++)
2515 g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2516 g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2523 #pragma omp parallel for default(shared) private(k)
2525 for (k = 0; k < N2; k++)
2527 g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0));
2528 g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2534 FFTW(execute)(ths->my_fftw_plan1);
2538 nfft_trafo_1d_B(ths);
2543 void X(adjoint_1d)(
X(plan) *ths)
2546 C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2547 R *c_phi_inv1, *c_phi_inv2;
2555 f_hat1=(C*)ths->f_hat;
2556 f_hat2=(C*)&ths->f_hat[N/2];
2557 g_hat1=(C*)&ths->g_hat[n-N/2];
2558 g_hat2=(C*)ths->g_hat;
2561 nfft_adjoint_1d_B(ths);
2565 FFTW(execute)(ths->my_fftw_plan2);
2569 if(ths->flags & PRE_PHI_HUT)
2572 c_phi_inv1=ths->c_phi_inv[0];
2573 c_phi_inv2=&ths->c_phi_inv[0][N/2];
2576 #pragma omp parallel for default(shared) private(k)
2578 for (k = 0; k < N/2; k++)
2580 f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2581 f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2589 #pragma omp parallel for default(shared) private(k)
2591 for (k = 0; k < N/2; k++)
2593 f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0));
2594 f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2603 static void nfft_2d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
2606 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2608 fg_exp_b0 = EXP(K(-1.0)/b);
2609 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2612 fg_exp_l[0] = K(1.0);
2613 for(l=1; l <= 2*m+1; l++)
2615 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2616 fg_exp_b1 *= fg_exp_b0_sq;
2617 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2621 static void nfft_trafo_2d_compute(C *fj,
const C *g,
const R *psij_const0,
2622 const R *psij_const1,
const R *xj0,
const R *xj1,
const INT n0,
2623 const INT n1,
const INT m)
2625 INT u0,o0,l0,u1,o1,l1;
2627 const R *psij0,*psij1;
2632 uo2(&u0,&o0,*xj0, n0, m);
2633 uo2(&u1,&o1,*xj1, n1, m);
2639 for(l0=0; l0<=2*m+1; l0++,psij0++)
2643 for(l1=0; l1<=2*m+1; l1++)
2644 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2647 for(l0=0; l0<=2*m+1; l0++,psij0++)
2651 for(l1=0; l1<2*m+1-o1; l1++)
2652 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2654 for(l1=0; l1<=o1; l1++)
2655 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2660 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2664 for(l1=0; l1<=2*m+1; l1++)
2665 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2667 for(l0=0; l0<=o0; l0++,psij0++)
2671 for(l1=0; l1<=2*m+1; l1++)
2672 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2677 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2681 for(l1=0; l1<2*m+1-o1; l1++)
2682 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2684 for(l1=0; l1<=o1; l1++)
2685 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2687 for(l0=0; l0<=o0; l0++,psij0++)
2691 for(l1=0; l1<2*m+1-o1; l1++)
2692 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2694 for(l1=0; l1<=o1; l1++)
2695 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2702 static void nfft_adjoint_2d_compute_omp_atomic(
const C f, C *g,
2703 const R *psij_const0,
const R *psij_const1,
const R *xj0,
2704 const R *xj1,
const INT n0,
const INT n1,
const INT m)
2706 INT u0,o0,l0,u1,o1,l1;
2707 const INT lprod = (2*m+2) * (2*m+2);
2709 INT index_temp0[2*m+2];
2710 INT index_temp1[2*m+2];
2712 uo2(&u0,&o0,*xj0, n0, m);
2713 uo2(&u1,&o1,*xj1, n1, m);
2715 for (l0=0; l0<=2*m+1; l0++)
2716 index_temp0[l0] = (u0+l0)%n0;
2718 for (l1=0; l1<=2*m+1; l1++)
2719 index_temp1[l1] = (u1+l1)%n1;
2721 for(l0=0; l0<=2*m+1; l0++)
2723 for(l1=0; l1<=2*m+1; l1++)
2725 INT i = index_temp0[l0] * n1 + index_temp1[l1];
2727 R *lhs_real = (R*)lhs;
2728 C val = psij_const0[l0] * psij_const1[l1] * f;
2731 lhs_real[0] += creal(val);
2734 lhs_real[1] += cimag(val);
2759 static void nfft_adjoint_2d_compute_omp_blockwise(
const C f, C *g,
2760 const R *psij_const0,
const R *psij_const1,
const R *xj0,
2761 const R *xj1,
const INT n0,
const INT n1,
const INT m,
2762 const INT my_u0,
const INT my_o0)
2764 INT ar_u0,ar_o0,l0,u1,o1,l1;
2765 const INT lprod = (2*m+2) * (2*m+2);
2766 INT index_temp1[2*m+2];
2768 uo2(&ar_u0,&ar_o0,*xj0, n0, m);
2769 uo2(&u1,&o1,*xj1, n1, m);
2771 for (l1 = 0; l1 <= 2*m+1; l1++)
2772 index_temp1[l1] = (u1+l1)%n1;
2776 INT u0 = MAX(my_u0,ar_u0);
2777 INT o0 = MIN(my_o0,ar_o0);
2778 INT offset_psij = u0-ar_u0;
2780 assert(offset_psij >= 0);
2781 assert(o0-u0 <= 2*m+1);
2782 assert(offset_psij+o0-u0 <= 2*m+1);
2785 for (l0 = 0; l0 <= o0-u0; l0++)
2787 INT i0 = (u0+l0) * n1;
2788 const C val0 = psij_const0[offset_psij+l0];
2790 for(l1=0; l1<=2*m+1; l1++)
2791 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2796 INT u0 = MAX(my_u0,ar_u0);
2798 INT offset_psij = u0-ar_u0;
2800 assert(offset_psij >= 0);
2801 assert(o0-u0 <= 2*m+1);
2802 assert(offset_psij+o0-u0 <= 2*m+1);
2805 for (l0 = 0; l0 <= o0-u0; l0++)
2807 INT i0 = (u0+l0) * n1;
2808 const C val0 = psij_const0[offset_psij+l0];
2810 for(l1=0; l1<=2*m+1; l1++)
2811 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2815 o0 = MIN(my_o0,ar_o0);
2816 offset_psij += my_u0-ar_u0+n0;
2821 assert(o0-u0 <= 2*m+1);
2822 assert(offset_psij+o0-u0 <= 2*m+1);
2826 for (l0 = 0; l0 <= o0-u0; l0++)
2828 INT i0 = (u0+l0) * n1;
2829 const C val0 = psij_const0[offset_psij+l0];
2831 for(l1=0; l1<=2*m+1; l1++)
2832 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2839 static void nfft_adjoint_2d_compute_serial(
const C *fj, C *g,
2840 const R *psij_const0,
const R *psij_const1,
const R *xj0,
2841 const R *xj1,
const INT n0,
const INT n1,
const INT m)
2843 INT u0,o0,l0,u1,o1,l1;
2845 const R *psij0,*psij1;
2850 uo2(&u0,&o0,*xj0, n0, m);
2851 uo2(&u1,&o1,*xj1, n1, m);
2855 for(l0=0; l0<=2*m+1; l0++,psij0++)
2859 for(l1=0; l1<=2*m+1; l1++)
2860 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2863 for(l0=0; l0<=2*m+1; l0++,psij0++)
2867 for(l1=0; l1<2*m+1-o1; l1++)
2868 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2870 for(l1=0; l1<=o1; l1++)
2871 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2876 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2880 for(l1=0; l1<=2*m+1; l1++)
2881 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2883 for(l0=0; l0<=o0; l0++,psij0++)
2887 for(l1=0; l1<=2*m+1; l1++)
2888 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2893 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2897 for(l1=0; l1<2*m+1-o1; l1++)
2898 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2900 for(l1=0; l1<=o1; l1++)
2901 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2903 for(l0=0; l0<=o0; l0++,psij0++)
2907 for(l1=0; l1<2*m+1-o1; l1++)
2908 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2910 for(l1=0; l1<=o1; l1++)
2911 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2917 static void nfft_trafo_2d_B(
X(plan) *ths)
2919 const C *g = (C*)ths->g;
2920 const INT n0 = ths->n[0];
2921 const INT n1 = ths->n[1];
2922 const INT M = ths->M_total;
2923 const INT m = ths->m;
2927 if(ths->flags & PRE_FULL_PSI)
2929 const INT lprod = (2*m+2) * (2*m+2);
2931 #pragma omp parallel for default(shared) private(k)
2933 for (k = 0; k < M; k++)
2936 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2938 for (l = 0; l < lprod; l++)
2939 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
2944 if(ths->flags & PRE_PSI)
2947 #pragma omp parallel for default(shared) private(k)
2949 for (k = 0; k < M; k++)
2951 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2952 nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
2958 if(ths->flags & PRE_FG_PSI)
2960 R fg_exp_l[2*(2*m+2)];
2962 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2963 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
2966 #pragma omp parallel for default(shared) private(k)
2968 for (k = 0; k < M; k++)
2970 R psij_const[2*(2*m+2)];
2971 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2973 R fg_psij0 = ths->psi[2*j*2];
2974 R fg_psij1 = ths->psi[2*j*2+1];
2975 R fg_psij2 = K(1.0);
2977 psij_const[0] = fg_psij0;
2978 for (l = 1; l <= 2*m+1; l++)
2980 fg_psij2 *= fg_psij1;
2981 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
2984 fg_psij0 = ths->psi[2*(j*2+1)];
2985 fg_psij1 = ths->psi[2*(j*2+1)+1];
2987 psij_const[2*m+2] = fg_psij0;
2988 for (l = 1; l <= 2*m+1; l++)
2990 fg_psij2 *= fg_psij1;
2991 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
2994 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3000 if(ths->flags & FG_PSI)
3002 R fg_exp_l[2*(2*m+2)];
3004 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3005 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3010 #pragma omp parallel for default(shared) private(k)
3012 for (k = 0; k < M; k++)
3015 R fg_psij0, fg_psij1, fg_psij2;
3016 R psij_const[2*(2*m+2)];
3017 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3019 uo(ths, j, &u, &o, (INT)0);
3020 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0));
3021 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3023 psij_const[0] = fg_psij0;
3024 for (l = 1; l <= 2*m+1; l++)
3026 fg_psij2 *= fg_psij1;
3027 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3030 uo(ths,j,&u,&o, (INT)1);
3031 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3032 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3034 psij_const[2*m+2] = fg_psij0;
3035 for(l=1; l<=2*m+1; l++)
3037 fg_psij2 *= fg_psij1;
3038 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3041 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3047 if(ths->flags & PRE_LIN_PSI)
3049 const INT K = ths->K, ip_s = K / (m + 2);
3054 #pragma omp parallel for default(shared) private(k)
3056 for (k = 0; k < M; k++)
3061 R psij_const[2*(2*m+2)];
3062 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3064 uo(ths,j,&u,&o,(INT)0);
3065 ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s);
3066 ip_u = (INT)LRINT(FLOOR(ip_y));
3067 ip_w = ip_y - (R)(ip_u);
3068 for (l = 0; l < 2*m+2; l++)
3069 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3071 uo(ths,j,&u,&o,(INT)1);
3072 ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s);
3073 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3074 ip_w = ip_y - (R)(ip_u);
3075 for (l = 0; l < 2*m+2; l++)
3076 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3078 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3088 #pragma omp parallel for default(shared) private(k)
3090 for (k = 0; k < M; k++)
3092 R psij_const[2*(2*m+2)];
3094 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3096 uo(ths,j,&u,&o,(INT)0);
3097 for (l = 0; l <= 2*m+1; l++)
3098 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3100 uo(ths,j,&u,&o,(INT)1);
3101 for (l = 0; l <= 2*m+1; l++)
3102 psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1));
3104 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3109 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3111 assert(ar_x[2*k] >= min_u_a || k == M-1); \
3113 assert(ar_x[2*k-2] < min_u_a); \
3116 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
3120 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3122 assert(ar_x[2*k] >= min_u_b || k == M-1); \
3124 assert(ar_x[2*k-2] < min_u_b); \
3127 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
3130 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
3131 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3132 ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
3133 ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
3135 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
3137 R psij_const[2*(2*m+2)]; \
3139 R fg_psij0 = ths->psi[2*j*2]; \
3140 R fg_psij1 = ths->psi[2*j*2+1]; \
3141 R fg_psij2 = K(1.0); \
3143 psij_const[0] = fg_psij0; \
3144 for(l=1; l<=2*m+1; l++) \
3146 fg_psij2 *= fg_psij1; \
3147 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3150 fg_psij0 = ths->psi[2*(j*2+1)]; \
3151 fg_psij1 = ths->psi[2*(j*2+1)+1]; \
3152 fg_psij2 = K(1.0); \
3153 psij_const[2*m+2] = fg_psij0; \
3154 for(l=1; l<=2*m+1; l++) \
3156 fg_psij2 *= fg_psij1; \
3157 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3160 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3161 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3162 n0, n1, m, my_u0, my_o0); \
3165 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
3167 R psij_const[2*(2*m+2)]; \
3168 R fg_psij0, fg_psij1, fg_psij2; \
3171 uo(ths,j,&u,&o,(INT)0); \
3172 fg_psij0 = (PHI(ths->n[0],ths->x[2*j]-((R)u)/n0,0)); \
3173 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \
3174 fg_psij2 = K(1.0); \
3175 psij_const[0] = fg_psij0; \
3176 for(l=1; l<=2*m+1; l++) \
3178 fg_psij2 *= fg_psij1; \
3179 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3182 uo(ths,j,&u,&o,(INT)1); \
3183 fg_psij0 = (PHI(ths->n[1],ths->x[2*j+1]-((R)u)/n1,1)); \
3184 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \
3185 fg_psij2 = K(1.0); \
3186 psij_const[2*m+2] = fg_psij0; \
3187 for(l=1; l<=2*m+1; l++) \
3189 fg_psij2 *= fg_psij1; \
3190 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3193 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3194 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3195 n0, n1, m, my_u0, my_o0); \
3198 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
3200 R psij_const[2*(2*m+2)]; \
3205 uo(ths,j,&u,&o,(INT)0); \
3206 ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \
3207 ip_u = LRINT(FLOOR(ip_y)); \
3209 for(l=0; l < 2*m+2; l++) \
3210 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3211 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
3213 uo(ths,j,&u,&o,(INT)1); \
3214 ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \
3215 ip_u = LRINT(FLOOR(ip_y)); \
3217 for(l=0; l < 2*m+2; l++) \
3218 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3219 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
3221 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3222 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3223 n0, n1, m, my_u0, my_o0); \
3226 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
3228 R psij_const[2*(2*m+2)]; \
3231 uo(ths,j,&u,&o,(INT)0); \
3232 for(l=0;l<=2*m+1;l++) \
3233 psij_const[l]=(PHI(ths->n[0],ths->x[2*j]-((R)((u+l)))/n0,0)); \
3235 uo(ths,j,&u,&o,(INT)1); \
3236 for(l=0;l<=2*m+1;l++) \
3237 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[2*j+1]-((R)((u+l)))/n1,1)); \
3239 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3240 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3241 n0, n1, m, my_u0, my_o0); \
3244 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
3246 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
3248 _Pragma("omp parallel private(k)") \
3250 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
3251 INT *ar_x = ths->index_x; \
3253 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
3254 &min_u_b, &max_u_b, 2, ths->n, m); \
3256 if (min_u_a != -1) \
3258 k = index_x_binary_search(ar_x, M, min_u_a); \
3260 MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3264 INT u_prod = ar_x[2*k]; \
3265 INT j = ar_x[2*k+1]; \
3267 if (u_prod < min_u_a || u_prod > max_u_a) \
3270 MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3276 if (min_u_b != -1) \
3278 INT k = index_x_binary_search(ar_x, M, min_u_b); \
3280 MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3284 INT u_prod = ar_x[2*k]; \
3285 INT j = ar_x[2*k+1]; \
3287 if (u_prod < min_u_b || u_prod > max_u_b) \
3290 MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3301 static void nfft_adjoint_2d_B(
X(plan) *ths)
3303 const INT n0 = ths->n[0];
3304 const INT n1 = ths->n[1];
3305 const INT M = ths->M_total;
3306 const INT m = ths->m;
3310 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
3312 if(ths->flags & PRE_FULL_PSI)
3314 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
3315 (INT)2, ths->n, m, ths->flags, ths->index_x);
3319 if(ths->flags & PRE_PSI)
3322 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
3326 #pragma omp parallel for default(shared) private(k)
3328 for (k = 0; k < M; k++)
3330 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3332 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3334 nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3340 if(ths->flags & PRE_FG_PSI)
3342 R fg_exp_l[2*(2*m+2)];
3344 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3345 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3348 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
3353 #pragma omp parallel for default(shared) private(k)
3355 for (k = 0; k < M; k++)
3357 R psij_const[2*(2*m+2)];
3358 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3360 R fg_psij0 = ths->psi[2*j*2];
3361 R fg_psij1 = ths->psi[2*j*2+1];
3362 R fg_psij2 = K(1.0);
3364 psij_const[0] = fg_psij0;
3365 for(l=1; l<=2*m+1; l++)
3367 fg_psij2 *= fg_psij1;
3368 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3371 fg_psij0 = ths->psi[2*(j*2+1)];
3372 fg_psij1 = ths->psi[2*(j*2+1)+1];
3374 psij_const[2*m+2] = fg_psij0;
3375 for(l=1; l<=2*m+1; l++)
3377 fg_psij2 *= fg_psij1;
3378 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3382 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3384 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3391 if(ths->flags & FG_PSI)
3393 R fg_exp_l[2*(2*m+2)];
3395 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3396 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3401 MACRO_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
3405 #pragma omp parallel for default(shared) private(k)
3407 for (k = 0; k < M; k++)
3410 R fg_psij0, fg_psij1, fg_psij2;
3411 R psij_const[2*(2*m+2)];
3412 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3414 uo(ths,j,&u,&o,(INT)0);
3415 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0));
3416 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3418 psij_const[0] = fg_psij0;
3419 for(l=1; l<=2*m+1; l++)
3421 fg_psij2 *= fg_psij1;
3422 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3425 uo(ths,j,&u,&o,(INT)1);
3426 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3427 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3429 psij_const[2*m+2] = fg_psij0;
3430 for(l=1; l<=2*m+1; l++)
3432 fg_psij2 *= fg_psij1;
3433 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3437 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3439 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3446 if(ths->flags & PRE_LIN_PSI)
3448 const INT K = ths->K;
3449 const INT ip_s = K / (m + 2);
3454 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
3458 #pragma omp parallel for default(shared) private(k)
3460 for (k = 0; k < M; k++)
3465 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3466 R psij_const[2*(2*m+2)];
3468 uo(ths,j,&u,&o,(INT)0);
3469 ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s);
3470 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3471 ip_w = ip_y - (R)(ip_u);
3472 for(l=0; l < 2*m+2; l++)
3473 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3474 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3476 uo(ths,j,&u,&o,(INT)1);
3477 ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s);
3478 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3479 ip_w = ip_y - (R)(ip_u);
3480 for(l=0; l < 2*m+2; l++)
3481 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3482 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3485 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3487 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3497 MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3501 #pragma omp parallel for default(shared) private(k)
3503 for (k = 0; k < M; k++)
3506 R psij_const[2*(2*m+2)];
3507 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3509 uo(ths,j,&u,&o,(INT)0);
3510 for(l=0;l<=2*m+1;l++)
3511 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3513 uo(ths,j,&u,&o,(INT)1);
3514 for(l=0;l<=2*m+1;l++)
3515 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1));
3518 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3520 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3526 void X(trafo_2d)(
X(plan) *ths)
3528 INT k0,k1,n0,n1,N0,N1;
3530 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3531 R ck01, ck02, ck11, ck12;
3532 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3542 f_hat=(C*)ths->f_hat;
3543 g_hat=(C*)ths->g_hat;
3547 #pragma omp parallel for default(shared) private(k0)
3548 for (k0 = 0; k0 < ths->n_total; k0++)
3549 ths->g_hat[k0] = 0.0;
3551 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
3553 if(ths->flags & PRE_PHI_HUT)
3555 c_phi_inv01=ths->c_phi_inv[0];
3556 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3559 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3561 for(k0=0;k0<N0/2;k0++)
3563 ck01=c_phi_inv01[k0];
3564 ck02=c_phi_inv02[k0];
3566 c_phi_inv11=ths->c_phi_inv[1];
3567 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3569 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3570 f_hat11=f_hat + k0*N1;
3571 g_hat21=g_hat + k0*n1+n1-(N1/2);
3572 f_hat21=f_hat + ((N0/2)+k0)*N1;
3573 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3574 f_hat12=f_hat + k0*N1+(N1/2);
3575 g_hat22=g_hat + k0*n1;
3576 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3578 for(k1=0;k1<N1/2;k1++)
3580 ck11=c_phi_inv11[k1];
3581 ck12=c_phi_inv12[k1];
3583 g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3584 g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3585 g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3586 g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3592 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3594 for(k0=0;k0<N0/2;k0++)
3596 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3597 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3598 for(k1=0;k1<N1/2;k1++)
3600 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3601 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3602 g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3603 g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3604 g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3605 g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3612 FFTW(execute)(ths->my_fftw_plan1);
3616 nfft_trafo_2d_B(ths);
3620 void X(adjoint_2d)(
X(plan) *ths)
3622 INT k0,k1,n0,n1,N0,N1;
3624 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3625 R ck01, ck02, ck11, ck12;
3626 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3636 f_hat=(C*)ths->f_hat;
3637 g_hat=(C*)ths->g_hat;
3640 nfft_adjoint_2d_B(ths);
3644 FFTW(execute)(ths->my_fftw_plan2);
3648 if(ths->flags & PRE_PHI_HUT)
3650 c_phi_inv01=ths->c_phi_inv[0];
3651 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3654 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3656 for(k0=0;k0<N0/2;k0++)
3658 ck01=c_phi_inv01[k0];
3659 ck02=c_phi_inv02[k0];
3661 c_phi_inv11=ths->c_phi_inv[1];
3662 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3664 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3665 f_hat11=f_hat + k0*N1;
3666 g_hat21=g_hat + k0*n1+n1-(N1/2);
3667 f_hat21=f_hat + ((N0/2)+k0)*N1;
3668 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3669 f_hat12=f_hat + k0*N1+(N1/2);
3670 g_hat22=g_hat + k0*n1;
3671 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3673 for(k1=0;k1<N1/2;k1++)
3675 ck11=c_phi_inv11[k1];
3676 ck12=c_phi_inv12[k1];
3678 f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3679 f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3680 f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3681 f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3687 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3689 for(k0=0;k0<N0/2;k0++)
3691 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3692 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3693 for(k1=0;k1<N1/2;k1++)
3695 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3696 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3697 f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3698 f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3699 f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3700 f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
3708 static void nfft_3d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
3711 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
3713 fg_exp_b0 = EXP(-K(1.0) / b);
3714 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
3717 fg_exp_l[0] = K(1.0);
3718 for(l=1; l <= 2*m+1; l++)
3720 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
3721 fg_exp_b1 *= fg_exp_b0_sq;
3722 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
3726 static void nfft_trafo_3d_compute(C *fj,
const C *g,
const R *psij_const0,
3727 const R *psij_const1,
const R *psij_const2,
const R *xj0,
const R *xj1,
3728 const R *xj2,
const INT n0,
const INT n1,
const INT n2,
const INT m)
3730 INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
3732 const R *psij0, *psij1, *psij2;
3734 psij0 = psij_const0;
3735 psij1 = psij_const1;
3736 psij2 = psij_const2;
3738 uo2(&u0, &o0, *xj0, n0, m);
3739 uo2(&u1, &o1, *xj1, n1, m);
3740 uo2(&u2, &o2, *xj2, n2, m);
3747 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3749 psij1 = psij_const1;
3750 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3752 psij2 = psij_const2;
3753 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3754 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3755 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3760 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3762 psij1 = psij_const1;
3763 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3765 psij2 = psij_const2;
3766 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3767 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3768 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3769 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3770 for (l2 = 0; l2 <= o2; l2++)
3771 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3776 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3778 psij1 = psij_const1;
3779 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3781 psij2 = psij_const2;
3782 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3783 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3784 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3786 for (l1 = 0; l1 <= o1; l1++, psij1++)
3788 psij2 = psij_const2;
3789 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3790 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3791 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3796 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3798 psij1 = psij_const1;
3799 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3801 psij2 = psij_const2;
3802 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3803 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3804 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3805 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3806 for (l2 = 0; l2 <= o2; l2++)
3807 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3809 for (l1 = 0; l1 <= o1; l1++, psij1++)
3811 psij2 = psij_const2;
3812 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3813 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3814 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3815 gj = g + ((u0 + l0) * n1 + l1) * n2;
3816 for (l2 = 0; l2 <= o2; l2++)
3817 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3825 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3827 psij1 = psij_const1;
3828 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3830 psij2 = psij_const2;
3831 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3832 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3833 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3837 for (l0 = 0; l0 <= o0; l0++, psij0++)
3839 psij1 = psij_const1;
3840 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3842 psij2 = psij_const2;
3843 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3844 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3845 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3850 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3852 psij1 = psij_const1;
3853 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3855 psij2 = psij_const2;
3856 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3857 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3858 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3859 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3860 for (l2 = 0; l2 <= o2; l2++)
3861 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3865 for (l0 = 0; l0 <= o0; l0++, psij0++)
3867 psij1 = psij_const1;
3868 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3870 psij2 = psij_const2;
3871 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3872 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3873 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3874 gj = g + (l0 * n1 + (u1 + l1)) * n2;
3875 for (l2 = 0; l2 <= o2; l2++)
3876 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3883 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3885 psij1 = psij_const1;
3886 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3888 psij2 = psij_const2;
3889 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3890 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3891 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3893 for (l1 = 0; l1 <= o1; l1++, psij1++)
3895 psij2 = psij_const2;
3896 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3897 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3898 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3901 for (l0 = 0; l0 <= o0; l0++, psij0++)
3903 psij1 = psij_const1;
3904 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3906 psij2 = psij_const2;
3907 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3908 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3909 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3911 for (l1 = 0; l1 <= o1; l1++, psij1++)
3913 psij2 = psij_const2;
3914 gj = g + (l0 * n1 + l1) * n2 + u2;
3915 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3916 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3921 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3923 psij1 = psij_const1;
3924 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3926 psij2 = psij_const2;
3927 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3928 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3929 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3930 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3931 for (l2 = 0; l2 <= o2; l2++)
3932 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3934 for (l1 = 0; l1 <= o1; l1++, psij1++)
3936 psij2 = psij_const2;
3937 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3938 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3939 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3940 gj = g + ((u0 + l0) * n1 + l1) * n2;
3941 for (l2 = 0; l2 <= o2; l2++)
3942 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3946 for (l0 = 0; l0 <= o0; l0++, psij0++)
3948 psij1 = psij_const1;
3949 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3951 psij2 = psij_const2;
3952 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3953 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3954 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3955 gj = g + (l0 * n1 + (u1 + l1)) * n2;
3956 for (l2 = 0; l2 <= o2; l2++)
3957 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3959 for (l1 = 0; l1 <= o1; l1++, psij1++)
3961 psij2 = psij_const2;
3962 gj = g + (l0 * n1 + l1) * n2 + u2;
3963 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3964 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3965 gj = g + (l0 * n1 + l1) * n2;
3966 for (l2 = 0; l2 <= o2; l2++)
3967 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3995 static void nfft_adjoint_3d_compute_omp_blockwise(
const C f, C *g,
3996 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
3997 const R *xj0,
const R *xj1,
const R *xj2,
3998 const INT n0,
const INT n1,
const INT n2,
const INT m,
3999 const INT my_u0,
const INT my_o0)
4001 INT ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
4002 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4004 INT index_temp1[2*m+2];
4005 INT index_temp2[2*m+2];
4007 uo2(&ar_u0,&ar_o0,*xj0, n0, m);
4008 uo2(&u1,&o1,*xj1, n1, m);
4009 uo2(&u2,&o2,*xj2, n2, m);
4011 for (l1=0; l1<=2*m+1; l1++)
4012 index_temp1[l1] = (u1+l1)%n1;
4014 for (l2=0; l2<=2*m+1; l2++)
4015 index_temp2[l2] = (u2+l2)%n2;
4019 INT u0 = MAX(my_u0,ar_u0);
4020 INT o0 = MIN(my_o0,ar_o0);
4021 INT offset_psij = u0-ar_u0;
4023 assert(offset_psij >= 0);
4024 assert(o0-u0 <= 2*m+1);
4025 assert(offset_psij+o0-u0 <= 2*m+1);
4028 for (l0 = 0; l0 <= o0-u0; l0++)
4030 const INT i0 = (u0+l0) * n1;
4031 const C val0 = psij_const0[offset_psij+l0];
4033 for(l1=0; l1<=2*m+1; l1++)
4035 const INT i1 = (i0 + index_temp1[l1]) * n2;
4036 const C val1 = psij_const1[l1];
4038 for(l2=0; l2<=2*m+1; l2++)
4039 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4045 INT u0 = MAX(my_u0,ar_u0);
4047 INT offset_psij = u0-ar_u0;
4049 assert(offset_psij >= 0);
4050 assert(o0-u0 <= 2*m+1);
4051 assert(offset_psij+o0-u0 <= 2*m+1);
4054 for (l0 = 0; l0 <= o0-u0; l0++)
4056 INT i0 = (u0+l0) * n1;
4057 const C val0 = psij_const0[offset_psij+l0];
4059 for(l1=0; l1<=2*m+1; l1++)
4061 const INT i1 = (i0 + index_temp1[l1]) * n2;
4062 const C val1 = psij_const1[l1];
4064 for(l2=0; l2<=2*m+1; l2++)
4065 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4070 o0 = MIN(my_o0,ar_o0);
4071 offset_psij += my_u0-ar_u0+n0;
4076 assert(o0-u0 <= 2*m+1);
4077 assert(offset_psij+o0-u0 <= 2*m+1);
4080 for (l0 = 0; l0 <= o0-u0; l0++)
4082 INT i0 = (u0+l0) * n1;
4083 const C val0 = psij_const0[offset_psij+l0];
4085 for(l1=0; l1<=2*m+1; l1++)
4087 const INT i1 = (i0 + index_temp1[l1]) * n2;
4088 const C val1 = psij_const1[l1];
4090 for(l2=0; l2<=2*m+1; l2++)
4091 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4100 static void nfft_adjoint_3d_compute_omp_atomic(
const C f, C *g,
4101 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
4102 const R *xj0,
const R *xj1,
const R *xj2,
4103 const INT n0,
const INT n1,
const INT n2,
const INT m)
4105 INT u0,o0,l0,u1,o1,l1,u2,o2,l2;
4106 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4108 INT index_temp0[2*m+2];
4109 INT index_temp1[2*m+2];
4110 INT index_temp2[2*m+2];
4112 uo2(&u0,&o0,*xj0, n0, m);
4113 uo2(&u1,&o1,*xj1, n1, m);
4114 uo2(&u2,&o2,*xj2, n2, m);
4116 for (l0=0; l0<=2*m+1; l0++)
4117 index_temp0[l0] = (u0+l0)%n0;
4119 for (l1=0; l1<=2*m+1; l1++)
4120 index_temp1[l1] = (u1+l1)%n1;
4122 for (l2=0; l2<=2*m+1; l2++)
4123 index_temp2[l2] = (u2+l2)%n2;
4125 for(l0=0; l0<=2*m+1; l0++)
4127 for(l1=0; l1<=2*m+1; l1++)
4129 for(l2=0; l2<=2*m+1; l2++)
4131 INT i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
4133 R *lhs_real = (R*)lhs;
4134 C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
4137 lhs_real[0] += creal(val);
4140 lhs_real[1] += cimag(val);
4148 static void nfft_adjoint_3d_compute_serial(
const C *fj, C *g,
4149 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
const R *xj0,
4150 const R *xj1,
const R *xj2,
const INT n0,
const INT n1,
const INT n2,
4153 INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4155 const R *psij0, *psij1, *psij2;
4157 psij0 = psij_const0;
4158 psij1 = psij_const1;
4159 psij2 = psij_const2;
4161 uo2(&u0, &o0, *xj0, n0, m);
4162 uo2(&u1, &o1, *xj1, n1, m);
4163 uo2(&u2, &o2, *xj2, n2, m);
4168 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4170 psij1 = psij_const1;
4171 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4173 psij2 = psij_const2;
4174 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4175 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4176 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4181 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4183 psij1 = psij_const1;
4184 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4186 psij2 = psij_const2;
4187 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4188 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4189 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4190 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4191 for (l2 = 0; l2 <= o2; l2++)
4192 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4197 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4199 psij1 = psij_const1;
4200 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4202 psij2 = psij_const2;
4203 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4204 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4205 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4207 for (l1 = 0; l1 <= o1; l1++, psij1++)
4209 psij2 = psij_const2;
4210 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4211 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4212 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4217 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4219 psij1 = psij_const1;
4220 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4222 psij2 = psij_const2;
4223 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4224 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4225 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4226 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4227 for (l2 = 0; l2 <= o2; l2++)
4228 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4230 for (l1 = 0; l1 <= o1; l1++, psij1++)
4232 psij2 = psij_const2;
4233 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4234 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4235 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4236 gj = g + ((u0 + l0) * n1 + l1) * n2;
4237 for (l2 = 0; l2 <= o2; l2++)
4238 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4246 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4248 psij1 = psij_const1;
4249 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4251 psij2 = psij_const2;
4252 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4253 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4254 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4258 for (l0 = 0; l0 <= o0; l0++, psij0++)
4260 psij1 = psij_const1;
4261 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4263 psij2 = psij_const2;
4264 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4265 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4266 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4271 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4273 psij1 = psij_const1;
4274 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4276 psij2 = psij_const2;
4277 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4278 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4279 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4280 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4281 for (l2 = 0; l2 <= o2; l2++)
4282 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4286 for (l0 = 0; l0 <= o0; l0++, psij0++)
4288 psij1 = psij_const1;
4289 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4291 psij2 = psij_const2;
4292 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4293 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4294 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4295 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4296 for (l2 = 0; l2 <= o2; l2++)
4297 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4304 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4306 psij1 = psij_const1;
4307 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4309 psij2 = psij_const2;
4310 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4311 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4312 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4314 for (l1 = 0; l1 <= o1; l1++, psij1++)
4316 psij2 = psij_const2;
4317 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4318 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4319 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4322 for (l0 = 0; l0 <= o0; l0++, psij0++)
4324 psij1 = psij_const1;
4325 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4327 psij2 = psij_const2;
4328 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4329 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4330 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4332 for (l1 = 0; l1 <= o1; l1++, psij1++)
4334 psij2 = psij_const2;
4335 gj = g + (l0 * n1 + l1) * n2 + u2;
4336 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4337 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4342 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4344 psij1 = psij_const1;
4345 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4347 psij2 = psij_const2;
4348 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4349 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4350 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4351 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4352 for (l2 = 0; l2 <= o2; l2++)
4353 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4355 for (l1 = 0; l1 <= o1; l1++, psij1++)
4357 psij2 = psij_const2;
4358 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4359 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4360 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4361 gj = g + ((u0 + l0) * n1 + l1) * n2;
4362 for (l2 = 0; l2 <= o2; l2++)
4363 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4367 for (l0 = 0; l0 <= o0; l0++, psij0++)
4369 psij1 = psij_const1;
4370 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4372 psij2 = psij_const2;
4373 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4374 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4375 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4376 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4377 for (l2 = 0; l2 <= o2; l2++)
4378 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4380 for (l1 = 0; l1 <= o1; l1++, psij1++)
4382 psij2 = psij_const2;
4383 gj = g + (l0 * n1 + l1) * n2 + u2;
4384 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4385 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4386 gj = g + (l0 * n1 + l1) * n2;
4387 for (l2 = 0; l2 <= o2; l2++)
4388 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4395 static void nfft_trafo_3d_B(
X(plan) *ths)
4397 const INT n0 = ths->n[0];
4398 const INT n1 = ths->n[1];
4399 const INT n2 = ths->n[2];
4400 const INT M = ths->M_total;
4401 const INT m = ths->m;
4403 const C* g = (C*) ths->g;
4407 if(ths->flags & PRE_FULL_PSI)
4409 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4411 #pragma omp parallel for default(shared) private(k)
4413 for (k = 0; k < M; k++)
4416 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4418 for (l = 0; l < lprod; l++)
4419 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
4424 if(ths->flags & PRE_PSI)
4427 #pragma omp parallel for default(shared) private(k)
4429 for (k = 0; k < M; k++)
4431 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4432 nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4437 if(ths->flags & PRE_FG_PSI)
4439 R fg_exp_l[3*(2*m+2)];
4441 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4442 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4443 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4446 #pragma omp parallel for default(shared) private(k)
4448 for (k = 0; k < M; k++)
4450 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4452 R psij_const[3*(2*m+2)];
4453 R fg_psij0 = ths->psi[2*j*3];
4454 R fg_psij1 = ths->psi[2*j*3+1];
4455 R fg_psij2 = K(1.0);
4457 psij_const[0] = fg_psij0;
4458 for(l=1; l<=2*m+1; l++)
4460 fg_psij2 *= fg_psij1;
4461 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4464 fg_psij0 = ths->psi[2*(j*3+1)];
4465 fg_psij1 = ths->psi[2*(j*3+1)+1];
4467 psij_const[2*m+2] = fg_psij0;
4468 for(l=1; l<=2*m+1; l++)
4470 fg_psij2 *= fg_psij1;
4471 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4474 fg_psij0 = ths->psi[2*(j*3+2)];
4475 fg_psij1 = ths->psi[2*(j*3+2)+1];
4477 psij_const[2*(2*m+2)] = fg_psij0;
4478 for(l=1; l<=2*m+1; l++)
4480 fg_psij2 *= fg_psij1;
4481 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4484 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4490 if(ths->flags & FG_PSI)
4492 R fg_exp_l[3*(2*m+2)];
4494 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4495 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4496 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4501 #pragma omp parallel for default(shared) private(k)
4503 for (k = 0; k < M; k++)
4505 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4507 R psij_const[3*(2*m+2)];
4508 R fg_psij0, fg_psij1, fg_psij2;
4510 uo(ths,j,&u,&o,(INT)0);
4511 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4512 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]);
4514 psij_const[0] = fg_psij0;
4515 for(l=1; l<=2*m+1; l++)
4517 fg_psij2 *= fg_psij1;
4518 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4521 uo(ths,j,&u,&o,(INT)1);
4522 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4523 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]);
4525 psij_const[2*m+2] = fg_psij0;
4526 for(l=1; l<=2*m+1; l++)
4528 fg_psij2 *= fg_psij1;
4529 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4532 uo(ths,j,&u,&o,(INT)2);
4533 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
4534 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]);
4536 psij_const[2*(2*m+2)] = fg_psij0;
4537 for(l=1; l<=2*m+1; l++)
4539 fg_psij2 *= fg_psij1;
4540 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4543 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4549 if(ths->flags & PRE_LIN_PSI)
4551 const INT K = ths->K, ip_s = K / (m + 2);
4556 #pragma omp parallel for default(shared) private(k)
4558 for (k = 0; k < M; k++)
4563 R psij_const[3*(2*m+2)];
4564 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4566 uo(ths,j,&u,&o,(INT)0);
4567 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
4568 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4569 ip_w = ip_y - (R)(ip_u);
4570 for(l=0; l < 2*m+2; l++)
4571 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4572 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4574 uo(ths,j,&u,&o,(INT)1);
4575 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
4576 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4577 ip_w = ip_y - (R)(ip_u);
4578 for(l=0; l < 2*m+2; l++)
4579 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4580 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4582 uo(ths,j,&u,&o,(INT)2);
4583 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s);
4584 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4585 ip_w = ip_y - (R)(ip_u);
4586 for(l=0; l < 2*m+2; l++)
4587 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4588 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4590 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4600 #pragma omp parallel for default(shared) private(k)
4602 for (k = 0; k < M; k++)
4604 R psij_const[3*(2*m+2)];
4606 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4608 uo(ths,j,&u,&o,(INT)0);
4609 for(l=0;l<=2*m+1;l++)
4610 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
4612 uo(ths,j,&u,&o,(INT)1);
4613 for(l=0;l<=2*m+1;l++)
4614 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
4616 uo(ths,j,&u,&o,(INT)2);
4617 for(l=0;l<=2*m+1;l++)
4618 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
4620 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4625 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4627 assert(ar_x[2*k] >= min_u_a || k == M-1); \
4629 assert(ar_x[2*k-2] < min_u_a); \
4632 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
4636 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4638 assert(ar_x[2*k] >= min_u_b || k == M-1); \
4640 assert(ar_x[2*k-2] < min_u_b); \
4643 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
4646 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
4647 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4648 ths->psi+j*3*(2*m+2), \
4649 ths->psi+(j*3+1)*(2*m+2), \
4650 ths->psi+(j*3+2)*(2*m+2), \
4651 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4652 n0, n1, n2, m, my_u0, my_o0);
4654 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
4657 R psij_const[3*(2*m+2)]; \
4658 R fg_psij0 = ths->psi[2*j*3]; \
4659 R fg_psij1 = ths->psi[2*j*3+1]; \
4660 R fg_psij2 = K(1.0); \
4662 psij_const[0] = fg_psij0; \
4663 for(l=1; l<=2*m+1; l++) \
4665 fg_psij2 *= fg_psij1; \
4666 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4669 fg_psij0 = ths->psi[2*(j*3+1)]; \
4670 fg_psij1 = ths->psi[2*(j*3+1)+1]; \
4671 fg_psij2 = K(1.0); \
4672 psij_const[2*m+2] = fg_psij0; \
4673 for(l=1; l<=2*m+1; l++) \
4675 fg_psij2 *= fg_psij1; \
4676 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4679 fg_psij0 = ths->psi[2*(j*3+2)]; \
4680 fg_psij1 = ths->psi[2*(j*3+2)+1]; \
4681 fg_psij2 = K(1.0); \
4682 psij_const[2*(2*m+2)] = fg_psij0; \
4683 for(l=1; l<=2*m+1; l++) \
4685 fg_psij2 *= fg_psij1; \
4686 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4689 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4690 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4691 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4692 n0, n1, n2, m, my_u0, my_o0); \
4695 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
4698 R psij_const[3*(2*m+2)]; \
4699 R fg_psij0, fg_psij1, fg_psij2; \
4701 uo(ths,j,&u,&o,(INT)0); \
4702 fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/n0,0)); \
4703 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \
4704 fg_psij2 = K(1.0); \
4705 psij_const[0] = fg_psij0; \
4706 for(l=1; l<=2*m+1; l++) \
4708 fg_psij2 *= fg_psij1; \
4709 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4712 uo(ths,j,&u,&o,(INT)1); \
4713 fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/n1,1)); \
4714 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \
4715 fg_psij2 = K(1.0); \
4716 psij_const[2*m+2] = fg_psij0; \
4717 for(l=1; l<=2*m+1; l++) \
4719 fg_psij2 *= fg_psij1; \
4720 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4723 uo(ths,j,&u,&o,(INT)2); \
4724 fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/n2,2)); \
4725 fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \
4726 fg_psij2 = K(1.0); \
4727 psij_const[2*(2*m+2)] = fg_psij0; \
4728 for(l=1; l<=2*m+1; l++) \
4730 fg_psij2 *= fg_psij1; \
4731 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4734 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4735 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4736 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4737 n0, n1, n2, m, my_u0, my_o0); \
4740 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
4743 R psij_const[3*(2*m+2)]; \
4747 uo(ths,j,&u,&o,(INT)0); \
4748 ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \
4749 ip_u = LRINT(FLOOR(ip_y)); \
4751 for(l=0; l < 2*m+2; l++) \
4752 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4753 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
4755 uo(ths,j,&u,&o,(INT)1); \
4756 ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \
4757 ip_u = LRINT(FLOOR(ip_y)); \
4759 for(l=0; l < 2*m+2; l++) \
4760 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4761 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4763 uo(ths,j,&u,&o,(INT)2); \
4764 ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \
4765 ip_u = LRINT(FLOOR(ip_y)); \
4767 for(l=0; l < 2*m+2; l++) \
4768 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4769 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4771 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4772 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4773 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4774 n0, n1, n2, m, my_u0, my_o0); \
4777 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
4780 R psij_const[3*(2*m+2)]; \
4782 uo(ths,j,&u,&o,(INT)0); \
4783 for(l=0;l<=2*m+1;l++) \
4784 psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/n0,0)); \
4786 uo(ths,j,&u,&o,(INT)1); \
4787 for(l=0;l<=2*m+1;l++) \
4788 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/n1,1)); \
4790 uo(ths,j,&u,&o,(INT)2); \
4791 for(l=0;l<=2*m+1;l++) \
4792 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
4794 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4795 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4796 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4797 n0, n1, n2, m, my_u0, my_o0); \
4800 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
4802 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
4804 _Pragma("omp parallel private(k)") \
4806 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
4807 INT *ar_x = ths->index_x; \
4809 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
4810 &min_u_b, &max_u_b, 3, ths->n, m); \
4812 if (min_u_a != -1) \
4814 k = index_x_binary_search(ar_x, M, min_u_a); \
4816 MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4820 INT u_prod = ar_x[2*k]; \
4821 INT j = ar_x[2*k+1]; \
4823 if (u_prod < min_u_a || u_prod > max_u_a) \
4826 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4832 if (min_u_b != -1) \
4834 INT k = index_x_binary_search(ar_x, M, min_u_b); \
4836 MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4840 INT u_prod = ar_x[2*k]; \
4841 INT j = ar_x[2*k+1]; \
4843 if (u_prod < min_u_b || u_prod > max_u_b) \
4846 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4856 static void nfft_adjoint_3d_B(
X(plan) *ths)
4859 const INT n0 = ths->n[0];
4860 const INT n1 = ths->n[1];
4861 const INT n2 = ths->n[2];
4862 const INT M = ths->M_total;
4863 const INT m = ths->m;
4867 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
4869 if(ths->flags & PRE_FULL_PSI)
4871 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
4872 (INT)3, ths->n, m, ths->flags, ths->index_x);
4876 if(ths->flags & PRE_PSI)
4879 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
4883 #pragma omp parallel for default(shared) private(k)
4885 for (k = 0; k < M; k++)
4887 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4889 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4891 nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4897 if(ths->flags & PRE_FG_PSI)
4899 R fg_exp_l[3*(2*m+2)];
4901 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4902 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4903 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4906 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
4910 #pragma omp parallel for default(shared) private(k)
4912 for (k = 0; k < M; k++)
4914 R psij_const[3*(2*m+2)];
4915 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4917 R fg_psij0 = ths->psi[2*j*3];
4918 R fg_psij1 = ths->psi[2*j*3+1];
4919 R fg_psij2 = K(1.0);
4921 psij_const[0] = fg_psij0;
4922 for(l=1; l<=2*m+1; l++)
4924 fg_psij2 *= fg_psij1;
4925 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4928 fg_psij0 = ths->psi[2*(j*3+1)];
4929 fg_psij1 = ths->psi[2*(j*3+1)+1];
4931 psij_const[2*m+2] = fg_psij0;
4932 for(l=1; l<=2*m+1; l++)
4934 fg_psij2 *= fg_psij1;
4935 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4938 fg_psij0 = ths->psi[2*(j*3+2)];
4939 fg_psij1 = ths->psi[2*(j*3+2)+1];
4941 psij_const[2*(2*m+2)] = fg_psij0;
4942 for(l=1; l<=2*m+1; l++)
4944 fg_psij2 *= fg_psij1;
4945 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4949 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4951 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4958 if(ths->flags & FG_PSI)
4960 R fg_exp_l[3*(2*m+2)];
4962 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4963 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4964 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4969 MACRO_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
4973 #pragma omp parallel for default(shared) private(k)
4975 for (k = 0; k < M; k++)
4978 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4979 R psij_const[3*(2*m+2)];
4980 R fg_psij0, fg_psij1, fg_psij2;
4982 uo(ths,j,&u,&o,(INT)0);
4983 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4984 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]);
4986 psij_const[0] = fg_psij0;
4987 for(l=1; l<=2*m+1; l++)
4989 fg_psij2 *= fg_psij1;
4990 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4993 uo(ths,j,&u,&o,(INT)1);
4994 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4995 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]);
4997 psij_const[2*m+2] = fg_psij0;
4998 for(l=1; l<=2*m+1; l++)
5000 fg_psij2 *= fg_psij1;
5001 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5004 uo(ths,j,&u,&o,(INT)2);
5005 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
5006 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]);
5008 psij_const[2*(2*m+2)] = fg_psij0;
5009 for(l=1; l<=2*m+1; l++)
5011 fg_psij2 *= fg_psij1;
5012 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5016 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5018 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5025 if(ths->flags & PRE_LIN_PSI)
5027 const INT K = ths->K;
5028 const INT ip_s = K / (m + 2);
5033 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
5037 #pragma omp parallel for default(shared) private(k)
5039 for (k = 0; k < M; k++)
5044 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5045 R psij_const[3*(2*m+2)];
5047 uo(ths,j,&u,&o,(INT)0);
5048 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
5049 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5050 ip_w = ip_y - (R)(ip_u);
5051 for(l=0; l < 2*m+2; l++)
5052 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5053 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5055 uo(ths,j,&u,&o,(INT)1);
5056 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
5057 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5058 ip_w = ip_y - (R)(ip_u);
5059 for(l=0; l < 2*m+2; l++)
5060 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5061 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5063 uo(ths,j,&u,&o,(INT)2);
5064 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s);
5065 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5066 ip_w = ip_y - (R)(ip_u);
5067 for(l=0; l < 2*m+2; l++)
5068 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5069 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5072 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5074 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5084 MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5088 #pragma omp parallel for default(shared) private(k)
5090 for (k = 0; k < M; k++)
5093 R psij_const[3*(2*m+2)];
5094 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5096 uo(ths,j,&u,&o,(INT)0);
5097 for(l=0;l<=2*m+1;l++)
5098 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
5100 uo(ths,j,&u,&o,(INT)1);
5101 for(l=0;l<=2*m+1;l++)
5102 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
5104 uo(ths,j,&u,&o,(INT)2);
5105 for(l=0;l<=2*m+1;l++)
5106 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
5109 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5111 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5117 void X(trafo_3d)(
X(plan) *ths)
5119 INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5121 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5122 R ck01, ck02, ck11, ck12, ck21, ck22;
5123 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5124 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5136 f_hat=(C*)ths->f_hat;
5137 g_hat=(C*)ths->g_hat;
5141 #pragma omp parallel for default(shared) private(k0)
5142 for (k0 = 0; k0 < ths->n_total; k0++)
5143 ths->g_hat[k0] = 0.0;
5145 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
5148 if(ths->flags & PRE_PHI_HUT)
5150 c_phi_inv01=ths->c_phi_inv[0];
5151 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5154 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5156 for(k0=0;k0<N0/2;k0++)
5158 ck01=c_phi_inv01[k0];
5159 ck02=c_phi_inv02[k0];
5160 c_phi_inv11=ths->c_phi_inv[1];
5161 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5163 for(k1=0;k1<N1/2;k1++)
5165 ck11=c_phi_inv11[k1];
5166 ck12=c_phi_inv12[k1];
5167 c_phi_inv21=ths->c_phi_inv[2];
5168 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5170 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5171 f_hat111=f_hat + (k0*N1+k1)*N2;
5172 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5173 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5174 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5175 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5176 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5177 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5179 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5180 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5181 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5182 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5183 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5184 f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5185 g_hat222=g_hat + (k0*n1+k1)*n2;
5186 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5188 for(k2=0;k2<N2/2;k2++)
5190 ck21=c_phi_inv21[k2];
5191 ck22=c_phi_inv22[k2];
5193 g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5194 g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5195 g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5196 g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5198 g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5199 g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5200 g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5201 g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5208 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5210 for(k0=0;k0<N0/2;k0++)
5212 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5213 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5214 for(k1=0;k1<N1/2;k1++)
5216 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5217 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5219 for(k2=0;k2<N2/2;k2++)
5221 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5222 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5224 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5225 g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5226 g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5227 g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5229 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5230 g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5231 g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5232 g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5240 FFTW(execute)(ths->my_fftw_plan1);
5244 nfft_trafo_3d_B(ths);
5248 void X(adjoint_3d)(
X(plan) *ths)
5250 INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5252 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5253 R ck01, ck02, ck11, ck12, ck21, ck22;
5254 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5255 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5267 f_hat=(C*)ths->f_hat;
5268 g_hat=(C*)ths->g_hat;
5271 nfft_adjoint_3d_B(ths);
5275 FFTW(execute)(ths->my_fftw_plan2);
5279 if(ths->flags & PRE_PHI_HUT)
5281 c_phi_inv01=ths->c_phi_inv[0];
5282 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5285 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5287 for(k0=0;k0<N0/2;k0++)
5289 ck01=c_phi_inv01[k0];
5290 ck02=c_phi_inv02[k0];
5291 c_phi_inv11=ths->c_phi_inv[1];
5292 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5294 for(k1=0;k1<N1/2;k1++)
5296 ck11=c_phi_inv11[k1];
5297 ck12=c_phi_inv12[k1];
5298 c_phi_inv21=ths->c_phi_inv[2];
5299 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5301 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5302 f_hat111=f_hat + (k0*N1+k1)*N2;
5303 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5304 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5305 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5306 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5307 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5308 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5310 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5311 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5312 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5313 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5314 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5315 f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5316 g_hat222=g_hat + (k0*n1+k1)*n2;
5317 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5319 for(k2=0;k2<N2/2;k2++)
5321 ck21=c_phi_inv21[k2];
5322 ck22=c_phi_inv22[k2];
5324 f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5325 f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5326 f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5327 f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5329 f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5330 f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5331 f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5332 f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5339 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5341 for(k0=0;k0<N0/2;k0++)
5343 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5344 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5345 for(k1=0;k1<N1/2;k1++)
5347 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5348 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5350 for(k2=0;k2<N2/2;k2++)
5352 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5353 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5355 f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5356 f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5357 f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5358 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5360 f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5361 f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5362 f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5363 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5373 void X(trafo)(
X(plan) *ths)
5377 case 1:
X(trafo_1d)(ths);
break;
5378 case 2:
X(trafo_2d)(ths);
break;
5379 case 3:
X(trafo_3d)(ths);
break;
5383 ths->g_hat = ths->g1;
5398 FFTW(execute)(ths->my_fftw_plan1);
5411 void X(adjoint)(
X(plan) *ths)
5415 case 1:
X(adjoint_1d)(ths);
break;
5416 case 2:
X(adjoint_2d)(ths);
break;
5417 case 3:
X(adjoint_3d)(ths);
break;
5436 FFTW(execute)(ths->my_fftw_plan2);
5452 static
void precompute_phi_hut(
X(plan) *ths)
5457 ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) *
sizeof(R*));
5459 for (t = 0; t < ths->d; t++)
5461 ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) *
sizeof(R));
5463 for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++)
5465 ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t));
5474 void X(precompute_lin_psi)(
X(plan) *ths)
5480 for (t=0; t<ths->d; t++)
5482 step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t]));
5483 for(j = 0;j <= ths->K; j++)
5485 ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t);
5490 void X(precompute_fg_psi)(
X(plan) *ths)
5497 for (t=0; t<ths->d; t++)
5501 #pragma omp parallel for default(shared) private(j,u,o)
5503 for (j = 0; j < ths->M_total; j++)
5507 ths->psi[2*(j*ths->d+t)]=
5508 (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t));
5510 ths->psi[2*(j*ths->d+t)+1]=
5511 EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]);
5517 void X(precompute_psi)(
X(plan) *ths)
5526 for (t=0; t<ths->d; t++)
5530 #pragma omp parallel for default(shared) private(j,l,lj,u,o)
5532 for (j = 0; j < ths->M_total; j++)
5536 for(l = u, lj = 0; l <= o; l++, lj++)
5537 ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
5538 (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t));
5545 static void nfft_precompute_full_psi_omp(
X(plan) *ths)
5552 for(t=0,lprod = 1; t<ths->d; t++)
5553 lprod *= 2*ths->m+2;
5556 #pragma omp parallel for default(shared) private(j)
5557 for(j=0; j<ths->M_total; j++)
5563 INT ll_plain[ths->d+1];
5565 INT u[ths->d], o[ths->d];
5567 R phi_prod[ths->d+1];
5573 MACRO_init_uo_l_lj_t;
5575 for(l_L=0; l_L<lprod; l_L++, ix++)
5577 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5579 ths->psi_index_g[ix]=ll_plain[ths->d];
5580 ths->psi[ix]=phi_prod[ths->d];
5582 MACRO_count_uo_l_lj_t;
5585 ths->psi_index_f[j]=lprod;
5590 void X(precompute_full_psi)(
X(plan) *ths)
5595 nfft_precompute_full_psi_omp(ths);
5602 INT ll_plain[ths->d+1];
5604 INT u[ths->d], o[ths->d];
5606 R phi_prod[ths->d+1];
5612 phi_prod[0] = K(1.0);
5615 for (t = 0, lprod = 1; t < ths->d; t++)
5616 lprod *= 2 * ths->m + 2;
5618 for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
5620 MACRO_init_uo_l_lj_t;
5622 for (l_L = 0; l_L < lprod; l_L++, ix++)
5624 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5626 ths->psi_index_g[ix] = ll_plain[ths->d];
5627 ths->psi[ix] = phi_prod[ths->d];
5629 MACRO_count_uo_l_lj_t;
5632 ths->psi_index_f[j] = ix - ix_old;
5638 void X(precompute_one_psi)(
X(plan) *ths)
5640 if(ths->flags & PRE_LIN_PSI)
5641 X(precompute_lin_psi)(ths);
5642 if(ths->flags & PRE_FG_PSI)
5643 X(precompute_fg_psi)(ths);
5644 if(ths->flags & PRE_PSI)
5645 X(precompute_psi)(ths);
5646 if(ths->flags & PRE_FULL_PSI)
5647 X(precompute_full_psi)(ths);
5650 static void init_help(
X(plan) *ths)
5655 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5656 ths->flags |= NFFT_SORT_NODES;
5658 ths->N_total = intprod(ths->N, 0, ths->d);
5659 ths->n_total = intprod(ths->n, 0, ths->d);
5661 ths->sigma = (R*) Y(malloc)((size_t)(ths->d) *
sizeof(R));
5663 for(t = 0;t < ths->d; t++)
5664 ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]);
5668 if(ths->flags & MALLOC_X)
5669 ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) *
sizeof(R));
5671 if(ths->flags & MALLOC_F_HAT)
5672 ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) *
sizeof(C));
5674 if(ths->flags & MALLOC_F)
5675 ths->f = (C*)Y(malloc)((size_t)(ths->M_total) *
sizeof(C));
5677 if(ths->flags & PRE_PHI_HUT)
5678 precompute_phi_hut(ths);
5680 if (ths->flags & PRE_LIN_PSI)
5684 ths->K = Y(m2K)(ths->m);
5686 ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) *
sizeof(R));
5689 if(ths->flags & PRE_FG_PSI)
5690 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) *
sizeof(R));
5692 if(ths->flags & PRE_PSI)
5693 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) *
sizeof(R));
5695 if(ths->flags & PRE_FULL_PSI)
5697 for (t = 0, lprod = 1; t < ths->d; t++)
5698 lprod *= 2 * ths->m + 2;
5700 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) *
sizeof(R));
5702 ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) *
sizeof(INT));
5703 ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) *
sizeof(INT));
5706 if(ths->flags & FFTW_INIT)
5709 INT nthreads = Y(get_num_threads)();
5712 ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) *
sizeof(C));
5714 if(ths->flags & FFT_OUT_OF_PLACE)
5715 ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) *
sizeof(C));
5720 #pragma omp critical (nfft_omp_critical_fftw_plan)
5722 FFTW(plan_with_nthreads)(nthreads);
5725 int *_n = Y(malloc)((size_t)(ths->d) *
sizeof(int));
5727 for (t = 0; t < ths->d; t++)
5728 _n[t] = (
int)(ths->n[t]);
5730 ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
5731 ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags);
5739 if(ths->flags & NFFT_SORT_NODES)
5740 ths->index_x = (INT*) Y(malloc)(
sizeof(INT) * 2U * (
size_t)(ths->M_total));
5742 ths->index_x = NULL;
5744 ths->mv_trafo = (void (*) (
void* ))
X(trafo);
5745 ths->mv_adjoint = (void (*) (
void* ))
X(adjoint);
5748 void X(init)(
X(plan) *ths,
int d,
int *N,
int M_total)
5754 ths->N = (INT*) Y(malloc)((size_t)(d) *
sizeof(INT));
5756 for (t = 0; t < d; t++)
5757 ths->N[t] = (INT)N[t];
5759 ths->M_total = (INT)M_total;
5761 ths->n = (INT*) Y(malloc)((size_t)(d) *
sizeof(INT));
5763 for (t = 0; t < d; t++)
5764 ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]));
5766 ths->m = WINDOW_HELP_ESTIMATE_m;
5771 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5772 FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
5773 NFFT_OMP_BLOCKWISE_ADJOINT;
5775 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5776 FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
5780 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5781 FFTW_INIT | FFT_OUT_OF_PLACE;
5783 ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
5789 void X(init_guru)(
X(plan) *ths,
int d,
int *N,
int M_total,
int *n,
int m,
5790 unsigned flags,
unsigned fftw_flags)
5795 ths->M_total = (INT)M_total;
5796 ths->N = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5798 for (t = 0; t < d; t++)
5799 ths->N[t] = (INT)N[t];
5801 ths->n = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5803 for (t = 0; t < d; t++)
5804 ths->n[t] = (INT)n[t];
5809 ths->fftw_flags = fftw_flags;
5815 void X(init_lin)(
X(plan) *ths,
int d,
int *N,
int M_total,
int *n,
int m,
int K,
5816 unsigned flags,
unsigned fftw_flags)
5821 ths->M_total = (INT)M_total;
5822 ths->N = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5824 for (t = 0; t < d; t++)
5825 ths->N[t] = (INT)N[t];
5827 ths->n = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5829 for (t = 0; t < d; t++)
5830 ths->n[t] = (INT)n[t];
5835 ths->fftw_flags = fftw_flags;
5841 void X(init_1d)(
X(plan) *ths,
int N1,
int M_total)
5847 X(init)(ths, 1, N, M_total);
5850 void X(init_2d)(
X(plan) *ths,
int N1,
int N2,
int M_total)
5856 X(init)(ths, 2, N, M_total);
5859 void X(init_3d)(
X(plan) *ths,
int N1,
int N2,
int N3,
int M_total)
5866 X(init)(ths, 3, N, M_total);
5869 const char*
X(check)(
X(plan) *ths)
5874 return "Member f not initialized.";
5877 return "Member x not initialized.";
5880 return "Member f_hat not initialized.";
5882 if ((ths->flags & PRE_LIN_PSI) && ths->K < ths->M_total)
5883 return "Number of nodes too small to use PRE_LIN_PSI.";
5885 for (j = 0; j < ths->M_total * ths->d; j++)
5887 if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
5889 return "ths->x out of range [-0.5,0.5)";
5893 for (j = 0; j < ths->d; j++)
5895 if (ths->sigma[j] <= 1)
5896 return "Oversampling factor too small";
5898 if(ths->N[j] <= ths->m)
5899 return "Polynomial degree N is smaller than cut-off m";
5901 if(ths->N[j]%2 == 1)
5902 return "polynomial degree N has to be even";
5907 void X(finalize)(
X(plan) *ths)
5911 if(ths->flags & NFFT_SORT_NODES)
5912 Y(free)(ths->index_x);
5914 if(ths->flags & FFTW_INIT)
5917 #pragma omp critical (nfft_omp_critical_fftw_plan)
5919 FFTW(destroy_plan)(ths->my_fftw_plan2);
5921 #pragma omp critical (nfft_omp_critical_fftw_plan)
5923 FFTW(destroy_plan)(ths->my_fftw_plan1);
5925 if(ths->flags & FFT_OUT_OF_PLACE)
5931 if(ths->flags & PRE_FULL_PSI)
5933 Y(free)(ths->psi_index_g);
5934 Y(free)(ths->psi_index_f);
5938 if(ths->flags & PRE_PSI)
5941 if(ths->flags & PRE_FG_PSI)
5944 if(ths->flags & PRE_LIN_PSI)
5947 if(ths->flags & PRE_PHI_HUT)
5949 for (t = 0; t < ths->d; t++)
5950 Y(free)(ths->c_phi_inv[t]);
5951 Y(free)(ths->c_phi_inv);
5954 if(ths->flags & MALLOC_F)
5957 if(ths->flags & MALLOC_F_HAT)
5958 Y(free)(ths->f_hat);
5960 if(ths->flags & MALLOC_X)
5963 WINDOW_HELP_FINALIZE;
5965 Y(free)(ths->sigma);
#define TIC(a)
Timing, method works since the inaccurate timer is updated mostly in the measured function...
#define X(name)
Include header for C99 complex datatype.
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.