NFFT  3.3.0
nfft.c
1 /*
2  * Copyright (c) 2002, 2015 Jens Keiner, Stefan Kunis, Daniel Potts
3  *
4  * This program is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation; either version 2 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; if not, write to the Free Software Foundation, Inc., 51
16  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18 
19 /* $Id$ */
20 
21 /* Nonequispaced FFT */
22 
23 /* Authors: D. Potts, S. Kunis 2002-2009, Jens Keiner 2009, Toni Volkmer 2012 */
24 
25 /* configure header */
26 #include "config.h"
27 
28 /* complex datatype (maybe) */
29 #ifdef HAVE_COMPLEX_H
30 #include<complex.h>
31 #endif
32 
33 /* NFFT headers */
34 #include "nfft3.h"
35 #include "infft.h"
36 
37 #ifdef _OPENMP
38 #include <omp.h>
39 #endif
40 
41 #ifdef OMP_ASSERT
42 #include <assert.h>
43 #endif
44 
45 #undef X
46 #define X(name) NFFT(name)
47 
49 static inline INT intprod(const INT *vec, const INT a, const INT d)
50 {
51  INT t, p;
52 
53  p = 1;
54  for (t = 0; t < d; t++)
55  p *= vec[t] - a;
56 
57  return p;
58 }
59 
60 /* handy shortcuts */
61 #define BASE(x) CEXP(x)
62 
77 static inline void sort0(const INT d, const INT *n, const INT m,
78  const INT local_x_num, const R *local_x, INT *ar_x)
79 {
80  INT u_j[d], i, j, help, rhigh;
81  INT *ar_x_temp;
82  INT nprod;
83 
84  for (i = 0; i < local_x_num; i++)
85  {
86  ar_x[2 * i] = 0;
87  ar_x[2 *i + 1] = i;
88  for (j = 0; j < d; j++)
89  {
90  help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m)));
91  u_j[j] = (help % n[j] + n[j]) % n[j];
92 
93  ar_x[2 * i] += u_j[j];
94  if (j + 1 < d)
95  ar_x[2 * i] *= n[j + 1];
96  }
97  }
98 
99  for (j = 0, nprod = 1; j < d; j++)
100  nprod *= n[j];
101 
102  rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1;
103 
104  ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) * sizeof(INT));
105  Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh);
106 #ifdef OMP_ASSERT
107  for (i = 1; i < local_x_num; i++)
108  assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]);
109 #endif
110  Y(free)(ar_x_temp);
111 }
112 
121 static inline void sort(const X(plan) *ths)
122 {
123  if (ths->flags & NFFT_SORT_NODES)
124  sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
125 }
126 
147 void X(trafo_direct)(const X(plan) *ths)
148 {
149  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
150 
151  memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
152 
153  if (ths->d == 1)
154  {
155  /* specialize for univariate case, rationale: faster */
156  INT j;
157 #ifdef _OPENMP
158  #pragma omp parallel for default(shared) private(j)
159 #endif
160  for (j = 0; j < ths->M_total; j++)
161  {
162  INT k_L;
163  for (k_L = 0; k_L < ths->N_total; k_L++)
164  {
165  R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j];
166  f[j] += f_hat[k_L] * BASE(-II * omega);
167  }
168  }
169  }
170  else
171  {
172  /* multivariate case */
173  INT j;
174 #ifdef _OPENMP
175  #pragma omp parallel for default(shared) private(j)
176 #endif
177  for (j = 0; j < ths->M_total; j++)
178  {
179  R x[ths->d], omega, Omega[ths->d + 1];
180  INT t, t2, k_L, k[ths->d];
181  Omega[0] = K(0.0);
182  for (t = 0; t < ths->d; t++)
183  {
184  k[t] = -ths->N[t]/2;
185  x[t] = K2PI * ths->x[j * ths->d + t];
186  Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
187  }
188  omega = Omega[ths->d];
189 
190  for (k_L = 0; k_L < ths->N_total; k_L++)
191  {
192  f[j] += f_hat[k_L] * BASE(-II * omega);
193  {
194  for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--)
195  k[t]-= ths->N[t]-1;
196 
197  k[t]++;
198 
199  for (t2 = t; t2 < ths->d; t2++)
200  Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
201 
202  omega = Omega[ths->d];
203  }
204  }
205  }
206  }
207 }
208 
209 void X(adjoint_direct)(const X(plan) *ths)
210 {
211  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
212 
213  memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
214 
215  if (ths->d == 1)
216  {
217  /* specialize for univariate case, rationale: faster */
218 #ifdef _OPENMP
219  INT k_L;
220  #pragma omp parallel for default(shared) private(k_L)
221  for (k_L = 0; k_L < ths->N_total; k_L++)
222  {
223  INT j;
224  for (j = 0; j < ths->M_total; j++)
225  {
226  R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j];
227  f_hat[k_L] += f[j] * BASE(II * omega);
228  }
229  }
230 #else
231  INT j;
232  for (j = 0; j < ths->M_total; j++)
233  {
234  INT k_L;
235  for (k_L = 0; k_L < ths->N_total; k_L++)
236  {
237  R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j];
238  f_hat[k_L] += f[j] * BASE(II * omega);
239  }
240  }
241 #endif
242  }
243  else
244  {
245  /* multivariate case */
246  INT j, k_L;
247 #ifdef _OPENMP
248  #pragma omp parallel for default(shared) private(j, k_L)
249  for (k_L = 0; k_L < ths->N_total; k_L++)
250  {
251  INT k[ths->d], k_temp, t;
252 
253  k_temp = k_L;
254 
255  for (t = ths->d - 1; t >= 0; t--)
256  {
257  k[t] = k_temp % ths->N[t] - ths->N[t]/2;
258  k_temp /= ths->N[t];
259  }
260 
261  for (j = 0; j < ths->M_total; j++)
262  {
263  R omega = K(0.0);
264  for (t = 0; t < ths->d; t++)
265  omega += k[t] * K2PI * ths->x[j * ths->d + t];
266  f_hat[k_L] += f[j] * BASE(II * omega);
267  }
268  }
269 #else
270  for (j = 0; j < ths->M_total; j++)
271  {
272  R x[ths->d], omega, Omega[ths->d+1];
273  INT t, t2, k[ths->d];
274  Omega[0] = K(0.0);
275  for (t = 0; t < ths->d; t++)
276  {
277  k[t] = -ths->N[t]/2;
278  x[t] = K2PI * ths->x[j * ths->d + t];
279  Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
280  }
281  omega = Omega[ths->d];
282  for (k_L = 0; k_L < ths->N_total; k_L++)
283  {
284  f_hat[k_L] += f[j] * BASE(II * omega);
285 
286  for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
287  k[t]-= ths->N[t]-1;
288 
289  k[t]++;
290 
291  for (t2 = t; t2 < ths->d; t2++)
292  Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
293 
294  omega = Omega[ths->d];
295  }
296  }
297 #endif
298  }
299 }
300 
326 static inline void uo(const X(plan) *ths, const INT j, INT *up, INT *op,
327  const INT act_dim)
328 {
329  const R xj = ths->x[j * ths->d + act_dim];
330  INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim])));
331 
332  (*up) = c - (ths->m);
333  (*op) = c + 1 + (ths->m);
334 }
335 
336 static inline void uo2(INT *u, INT *o, const R x, const INT n, const INT m)
337 {
338  INT c = LRINT(FLOOR(x * (R)(n)));
339 
340  *u = (c - m + n) % n;
341  *o = (c + 1 + m + n) % n;
342 }
343 
344 #define MACRO_D_compute_A \
345 { \
346  g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
347 }
348 
349 #define MACRO_D_compute_T \
350 { \
351  f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
352 }
353 
354 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
355 
356 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
357 
358 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
359 
360 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2));
361 
362 #define MACRO_init_k_ks \
363 { \
364  for (t = ths->d-1; 0 <= t; t--) \
365  { \
366  kp[t] = k[t] = 0; \
367  ks[t] = ths->N[t]/2; \
368  } \
369  t++; \
370 }
371 
372 #define MACRO_update_c_phi_inv_k(which_one) \
373 { \
374  for (t2 = t; t2 < ths->d; t2++) \
375  { \
376  c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
377  ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
378  k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
379  } \
380 }
381 
382 #define MACRO_count_k_ks \
383 { \
384  for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
385  { \
386  kp[t] = k[t] = 0; \
387  ks[t]= ths->N[t]/2; \
388  } \
389 \
390  kp[t]++; k[t]++; ks[t]++; \
391  if(kp[t] == ths->N[t]/2) \
392  { \
393  k[t] = ths->n[t] - ths->N[t]/2; \
394  ks[t] = 0; \
395  } \
396 } \
397 
398 /* sub routines for the fast transforms matrix vector multiplication with D, D^T */
399 #define MACRO_D(which_one) \
400 static inline void D_serial_ ## which_one (X(plan) *ths) \
401 { \
402  C *f_hat, *g_hat; /* local copy */ \
403  R c_phi_inv_k[ths->d+1]; /* postfix product of PHI_HUT */ \
404  INT t, t2; /* index dimensions */ \
405  INT k_L; /* plain index */ \
406  INT kp[ths->d]; /* multi index (simple) */ \
407  INT k[ths->d]; /* multi index in g_hat */ \
408  INT ks[ths->d]; /* multi index in f_hat, c_phi_inv*/ \
409  INT k_plain[ths->d+1]; /* postfix plain index */ \
410  INT ks_plain[ths->d+1]; /* postfix plain index */ \
411  \
412  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
413  MACRO_D_init_result_ ## which_one; \
414 \
415  c_phi_inv_k[0] = K(1.0); \
416  k_plain[0] = 0; \
417  ks_plain[0] = 0; \
418 \
419  MACRO_init_k_ks; \
420 \
421  if (ths->flags & PRE_PHI_HUT) \
422  { \
423  for (k_L = 0; k_L < ths->N_total; k_L++) \
424  { \
425  MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
426  MACRO_D_compute_ ## which_one; \
427  MACRO_count_k_ks; \
428  } \
429  } \
430  else \
431  { \
432  for (k_L = 0; k_L < ths->N_total; k_L++) \
433  { \
434  MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
435  MACRO_D_compute_ ## which_one; \
436  MACRO_count_k_ks; \
437  } \
438  } \
439 }
440 
441 #ifdef _OPENMP
442 static inline void D_openmp_A(X(plan) *ths)
443 {
444  C *f_hat, *g_hat;
445  INT k_L;
447  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
448  memset(g_hat, 0, ths->n_total * sizeof(C));
449 
450  if (ths->flags & PRE_PHI_HUT)
451  {
452  #pragma omp parallel for default(shared) private(k_L)
453  for (k_L = 0; k_L < ths->N_total; k_L++)
454  {
455  INT kp[ths->d]; //0..N-1
456  INT k[ths->d];
457  INT ks[ths->d];
458  R c_phi_inv_k_val = K(1.0);
459  INT k_plain_val = 0;
460  INT ks_plain_val = 0;
461  INT t;
462  INT k_temp = k_L;
463 
464  for (t = ths->d-1; t >= 0; t--)
465  {
466  kp[t] = k_temp % ths->N[t];
467  if (kp[t] >= ths->N[t]/2)
468  k[t] = ths->n[t] - ths->N[t] + kp[t];
469  else
470  k[t] = kp[t];
471  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
472  k_temp /= ths->N[t];
473  }
474 
475  for (t = 0; t < ths->d; t++)
476  {
477  c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
478  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
479  k_plain_val = k_plain_val*ths->n[t] + k[t];
480  }
481 
482  g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
483  } /* for(k_L) */
484  } /* if(PRE_PHI_HUT) */
485  else
486  {
487  #pragma omp parallel for default(shared) private(k_L)
488  for (k_L = 0; k_L < ths->N_total; k_L++)
489  {
490  INT kp[ths->d]; //0..N-1
491  INT k[ths->d];
492  INT ks[ths->d];
493  R c_phi_inv_k_val = K(1.0);
494  INT k_plain_val = 0;
495  INT ks_plain_val = 0;
496  INT t;
497  INT k_temp = k_L;
498 
499  for (t = ths->d-1; t >= 0; t--)
500  {
501  kp[t] = k_temp % ths->N[t];
502  if (kp[t] >= ths->N[t]/2)
503  k[t] = ths->n[t] - ths->N[t] + kp[t];
504  else
505  k[t] = kp[t];
506  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
507  k_temp /= ths->N[t];
508  }
509 
510  for (t = 0; t < ths->d; t++)
511  {
512  c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
513  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
514  k_plain_val = k_plain_val*ths->n[t] + k[t];
515  }
516 
517  g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
518  } /* for(k_L) */
519  } /* else(PRE_PHI_HUT) */
520 }
521 #endif
522 
523 #ifndef _OPENMP
524 MACRO_D(A)
525 #endif
526 
527 static inline void D_A(X(plan) *ths)
528 {
529 #ifdef _OPENMP
530  D_openmp_A(ths);
531 #else
532  D_serial_A(ths);
533 #endif
534 }
535 
536 #ifdef _OPENMP
537 static void D_openmp_T(X(plan) *ths)
538 {
539  C *f_hat, *g_hat;
540  INT k_L;
542  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
543  memset(f_hat, 0, ths->N_total * sizeof(C));
544 
545  if (ths->flags & PRE_PHI_HUT)
546  {
547  #pragma omp parallel for default(shared) private(k_L)
548  for (k_L = 0; k_L < ths->N_total; k_L++)
549  {
550  INT kp[ths->d]; //0..N-1
551  INT k[ths->d];
552  INT ks[ths->d];
553  R c_phi_inv_k_val = K(1.0);
554  INT k_plain_val = 0;
555  INT ks_plain_val = 0;
556  INT t;
557  INT k_temp = k_L;
558 
559  for (t = ths->d - 1; t >= 0; t--)
560  {
561  kp[t] = k_temp % ths->N[t];
562  if (kp[t] >= ths->N[t]/2)
563  k[t] = ths->n[t] - ths->N[t] + kp[t];
564  else
565  k[t] = kp[t];
566  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
567  k_temp /= ths->N[t];
568  }
569 
570  for (t = 0; t < ths->d; t++)
571  {
572  c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
573  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
574  k_plain_val = k_plain_val*ths->n[t] + k[t];
575  }
576 
577  f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
578  } /* for(k_L) */
579  } /* if(PRE_PHI_HUT) */
580  else
581  {
582  #pragma omp parallel for default(shared) private(k_L)
583  for (k_L = 0; k_L < ths->N_total; k_L++)
584  {
585  INT kp[ths->d]; //0..N-1
586  INT k[ths->d];
587  INT ks[ths->d];
588  R c_phi_inv_k_val = K(1.0);
589  INT k_plain_val = 0;
590  INT ks_plain_val = 0;
591  INT t;
592  INT k_temp = k_L;
593 
594  for (t = ths->d-1; t >= 0; t--)
595  {
596  kp[t] = k_temp % ths->N[t];
597  if (kp[t] >= ths->N[t]/2)
598  k[t] = ths->n[t] - ths->N[t] + kp[t];
599  else
600  k[t] = kp[t];
601  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
602  k_temp /= ths->N[t];
603  }
604 
605  for (t = 0; t < ths->d; t++)
606  {
607  c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
608  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
609  k_plain_val = k_plain_val*ths->n[t] + k[t];
610  }
611 
612  f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
613  } /* for(k_L) */
614  } /* else(PRE_PHI_HUT) */
615 }
616 #endif
617 
618 #ifndef _OPENMP
619 MACRO_D(T)
620 #endif
621 
622 static void D_T(X(plan) *ths)
623 {
624 #ifdef _OPENMP
625  D_openmp_T(ths);
626 #else
627  D_serial_T(ths);
628 #endif
629 }
630 
631 /* sub routines for the fast transforms matrix vector multiplication with B, B^T */
632 #define MACRO_B_init_result_A memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
633 #define MACRO_B_init_result_T memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
634 
635 #define MACRO_B_PRE_FULL_PSI_compute_A \
636 { \
637  (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
638 }
639 
640 #define MACRO_B_PRE_FULL_PSI_compute_T \
641 { \
642  g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
643 }
644 
645 #define MACRO_B_compute_A \
646 { \
647  (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
648 }
649 
650 #define MACRO_B_compute_T \
651 { \
652  g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
653 }
654 
655 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
656 
657 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
658 
659 #define MACRO_without_PRE_PSI PHI(ths->n[t2], ths->x[j*ths->d+t2] \
660  - ((R)l[t2])/((R)ths->n[t2]), t2)
661 
662 #define MACRO_init_uo_l_lj_t \
663 { \
664  for (t = ths->d-1; t >= 0; t--) \
665  { \
666  uo(ths,j,&u[t],&o[t],t); \
667  l[t] = u[t]; \
668  lj[t] = 0; \
669  } \
670  t++; \
671 }
672 
673 #define MACRO_update_phi_prod_ll_plain(which_one) { \
674  for (t2 = t; t2 < ths->d; t2++) \
675  { \
676  phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \
677  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + (l[t2] + ths->n[t2]) % ths->n[t2]; \
678  } \
679 }
680 
681 #define MACRO_count_uo_l_lj_t \
682 { \
683  for (t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \
684  { \
685  l[t] = u[t]; \
686  lj[t] = 0; \
687  } \
688  \
689  l[t]++; \
690  lj[t]++; \
691 }
692 
693 #define MACRO_B(which_one) \
694 static inline void B_serial_ ## which_one (X(plan) *ths) \
695 { \
696  INT lprod; /* 'regular bandwidth' of matrix B */ \
697  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
698  INT t, t2; /* index dimensions */ \
699  INT j; /* index nodes */ \
700  INT l_L, ix; /* index one row of B */ \
701  INT l[ths->d]; /* multi index u<=l<=o */ \
702  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
703  INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
704  R phi_prod[ths->d+1]; /* postfix product of PHI */ \
705  C *f, *g; /* local copy */ \
706  C *fj; /* local copy */ \
707  R y[ths->d]; \
708  R fg_psi[ths->d][2*ths->m+2]; \
709  R fg_exp_l[ths->d][2*ths->m+2]; \
710  INT l_fg,lj_fg; \
711  R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
712  R ip_w; \
713  INT ip_u; \
714  INT ip_s = ths->K/(ths->m+2); \
715  \
716  f = (C*)ths->f; g = (C*)ths->g; \
717  \
718  MACRO_B_init_result_ ## which_one; \
719  \
720  if (ths->flags & PRE_FULL_PSI) \
721  { \
722  for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
723  { \
724  for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
725  { \
726  MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \
727  } \
728  } \
729  return; \
730  } \
731 \
732  phi_prod[0] = K(1.0); \
733  ll_plain[0] = 0; \
734 \
735  for (t = 0, lprod = 1; t < ths->d; t++) \
736  lprod *= (2 * ths->m + 2); \
737 \
738  if (ths->flags & PRE_PSI) \
739  { \
740  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
741  { \
742  MACRO_init_uo_l_lj_t; \
743  \
744  for (l_L = 0; l_L < lprod; l_L++) \
745  { \
746  MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \
747  \
748  MACRO_B_compute_ ## which_one; \
749  \
750  MACRO_count_uo_l_lj_t; \
751  } /* for(l_L) */ \
752  } /* for(j) */ \
753  return; \
754  } /* if(PRE_PSI) */ \
755  \
756  if (ths->flags & PRE_FG_PSI) \
757  { \
758  for(t2 = 0; t2 < ths->d; t2++) \
759  { \
760  tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \
761  tmpEXP2sq = tmpEXP2*tmpEXP2; \
762  tmp2 = K(1.0); \
763  tmp3 = K(1.0); \
764  fg_exp_l[t2][0] = K(1.0); \
765  for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \
766  { \
767  tmp3 = tmp2*tmpEXP2; \
768  tmp2 *= tmpEXP2sq; \
769  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \
770  } \
771  } \
772  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
773  { \
774  MACRO_init_uo_l_lj_t; \
775  \
776  for (t2 = 0; t2 < ths->d; t2++) \
777  { \
778  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
779  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
780  tmp1 = K(1.0); \
781  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
782  { \
783  tmp1 *= tmpEXP1; \
784  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
785  } \
786  } \
787  \
788  for (l_L= 0; l_L < lprod; l_L++) \
789  { \
790  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
791  \
792  MACRO_B_compute_ ## which_one; \
793  \
794  MACRO_count_uo_l_lj_t; \
795  } /* for(l_L) */ \
796  } /* for(j) */ \
797  return; \
798  } /* if(PRE_FG_PSI) */ \
799  \
800  if (ths->flags & FG_PSI) \
801  { \
802  for (t2 = 0; t2 < ths->d; t2++) \
803  { \
804  tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
805  tmpEXP2sq = tmpEXP2*tmpEXP2; \
806  tmp2 = K(1.0); \
807  tmp3 = K(1.0); \
808  fg_exp_l[t2][0] = K(1.0); \
809  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
810  { \
811  tmp3 = tmp2*tmpEXP2; \
812  tmp2 *= tmpEXP2sq; \
813  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
814  } \
815  } \
816  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
817  { \
818  MACRO_init_uo_l_lj_t; \
819  \
820  for (t2 = 0; t2 < ths->d; t2++) \
821  { \
822  fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\
823  \
824  tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
825  /ths->b[t2]); \
826  tmp1 = K(1.0); \
827  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
828  { \
829  tmp1 *= tmpEXP1; \
830  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
831  } \
832  } \
833  \
834  for (l_L = 0; l_L < lprod; l_L++) \
835  { \
836  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
837  \
838  MACRO_B_compute_ ## which_one; \
839  \
840  MACRO_count_uo_l_lj_t; \
841  } /* for(l_L) */ \
842  } /* for(j) */ \
843  return; \
844  } /* if(FG_PSI) */ \
845  \
846  if (ths->flags & PRE_LIN_PSI) \
847  { \
848  for (j = 0, fj=f; j<ths->M_total; j++, fj++) \
849  { \
850  MACRO_init_uo_l_lj_t; \
851  \
852  for (t2 = 0; t2 < ths->d; t2++) \
853  { \
854  y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
855  * ((R)(ths->K))) / (R)(ths->m + 2); \
856  ip_u = LRINT(FLOOR(y[t2])); \
857  ip_w = y[t2]-ip_u; \
858  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
859  { \
860  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
861  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
862  * (ip_w); \
863  } \
864  } \
865  \
866  for (l_L = 0; l_L < lprod; l_L++) \
867  { \
868  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
869  \
870  MACRO_B_compute_ ## which_one; \
871  \
872  MACRO_count_uo_l_lj_t; \
873  } /* for(l_L) */ \
874  } /* for(j) */ \
875  return; \
876  } /* if(PRE_LIN_PSI) */ \
877  \
878  /* no precomputed psi at all */ \
879  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
880  { \
881  MACRO_init_uo_l_lj_t; \
882  \
883  for (l_L = 0; l_L < lprod; l_L++) \
884  { \
885  MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \
886  \
887  MACRO_B_compute_ ## which_one; \
888  \
889  MACRO_count_uo_l_lj_t; \
890  } /* for(l_L) */ \
891  } /* for(j) */ \
892 } /* nfft_B */ \
893 
894 #ifndef _OPENMP
895 MACRO_B(A)
896 #endif
897 
898 #ifdef _OPENMP
899 static inline void B_openmp_A (X(plan) *ths)
900 {
901  INT lprod; /* 'regular bandwidth' of matrix B */
902  INT k;
903 
904  memset(ths->f, 0, ths->M_total * sizeof(C));
905 
906  for (k = 0, lprod = 1; k < ths->d; k++)
907  lprod *= (2*ths->m+2);
908 
909  if (ths->flags & PRE_FULL_PSI)
910  {
911  #pragma omp parallel for default(shared) private(k)
912  for (k = 0; k < ths->M_total; k++)
913  {
914  INT l;
915  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
916  ths->f[j] = K(0.0);
917  for (l = 0; l < lprod; l++)
918  ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
919  }
920  return;
921  }
922 
923  if (ths->flags & PRE_PSI)
924  {
925  #pragma omp parallel for default(shared) private(k)
926  for (k = 0; k < ths->M_total; k++)
927  {
928  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
929  INT t, t2; /* index dimensions */
930  INT l_L; /* index one row of B */
931  INT l[ths->d]; /* multi index u<=l<=o */
932  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
933  INT ll_plain[ths->d+1]; /* postfix plain index in g */
934  R phi_prod[ths->d+1]; /* postfix product of PHI */
935  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
936 
937  phi_prod[0] = K(1.0);
938  ll_plain[0] = 0;
939 
940  MACRO_init_uo_l_lj_t;
941 
942  for (l_L = 0; l_L < lprod; l_L++)
943  {
944  MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
945 
946  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
947 
948  MACRO_count_uo_l_lj_t;
949  } /* for(l_L) */
950  } /* for(j) */
951  return;
952  } /* if(PRE_PSI) */
953 
954  if (ths->flags & PRE_FG_PSI)
955  {
956  INT t, t2; /* index dimensions */
957  R fg_exp_l[ths->d][2*ths->m+2];
958 
959  for (t2 = 0; t2 < ths->d; t2++)
960  {
961  INT lj_fg;
962  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
963  R tmpEXP2sq = tmpEXP2*tmpEXP2;
964  R tmp2 = K(1.0);
965  R tmp3 = K(1.0);
966  fg_exp_l[t2][0] = K(1.0);
967  for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
968  {
969  tmp3 = tmp2*tmpEXP2;
970  tmp2 *= tmpEXP2sq;
971  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
972  }
973  }
974 
975  #pragma omp parallel for default(shared) private(k,t,t2)
976  for (k = 0; k < ths->M_total; k++)
977  {
978  INT ll_plain[ths->d+1]; /* postfix plain index in g */
979  R phi_prod[ths->d+1]; /* postfix product of PHI */
980  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
981  INT l[ths->d]; /* multi index u<=l<=o */
982  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
983  R fg_psi[ths->d][2*ths->m+2];
984  R tmpEXP1, tmp1;
985  INT l_fg,lj_fg;
986  INT l_L;
987  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
988 
989  phi_prod[0] = K(1.0);
990  ll_plain[0] = 0;
991 
992  MACRO_init_uo_l_lj_t;
993 
994  for (t2 = 0; t2 < ths->d; t2++)
995  {
996  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
997  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
998  tmp1 = K(1.0);
999  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1000  {
1001  tmp1 *= tmpEXP1;
1002  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1003  }
1004  }
1005 
1006  for (l_L= 0; l_L < lprod; l_L++)
1007  {
1008  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1009 
1010  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1011 
1012  MACRO_count_uo_l_lj_t;
1013  } /* for(l_L) */
1014  } /* for(j) */
1015  return;
1016  } /* if(PRE_FG_PSI) */
1017 
1018  if (ths->flags & FG_PSI)
1019  {
1020  INT t, t2; /* index dimensions */
1021  R fg_exp_l[ths->d][2*ths->m+2];
1022 
1023  sort(ths);
1024 
1025  for (t2 = 0; t2 < ths->d; t2++)
1026  {
1027  INT lj_fg;
1028  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1029  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1030  R tmp2 = K(1.0);
1031  R tmp3 = K(1.0);
1032  fg_exp_l[t2][0] = K(1.0);
1033  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1034  {
1035  tmp3 = tmp2*tmpEXP2;
1036  tmp2 *= tmpEXP2sq;
1037  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1038  }
1039  }
1040 
1041  #pragma omp parallel for default(shared) private(k,t,t2)
1042  for (k = 0; k < ths->M_total; k++)
1043  {
1044  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1045  R phi_prod[ths->d+1]; /* postfix product of PHI */
1046  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1047  INT l[ths->d]; /* multi index u<=l<=o */
1048  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1049  R fg_psi[ths->d][2*ths->m+2];
1050  R tmpEXP1, tmp1;
1051  INT l_fg,lj_fg;
1052  INT l_L;
1053  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1054 
1055  phi_prod[0] = K(1.0);
1056  ll_plain[0] = 0;
1057 
1058  MACRO_init_uo_l_lj_t;
1059 
1060  for (t2 = 0; t2 < ths->d; t2++)
1061  {
1062  fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1063 
1064  tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1065  /ths->b[t2]);
1066  tmp1 = K(1.0);
1067  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1068  {
1069  tmp1 *= tmpEXP1;
1070  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1071  }
1072  }
1073 
1074  for (l_L = 0; l_L < lprod; l_L++)
1075  {
1076  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1077 
1078  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1079 
1080  MACRO_count_uo_l_lj_t;
1081  } /* for(l_L) */
1082  } /* for(j) */
1083  return;
1084  } /* if(FG_PSI) */
1085 
1086  if (ths->flags & PRE_LIN_PSI)
1087  {
1088  sort(ths);
1089 
1090  #pragma omp parallel for default(shared) private(k)
1091  for (k = 0; k<ths->M_total; k++)
1092  {
1093  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1094  INT t, t2; /* index dimensions */
1095  INT l_L; /* index one row of B */
1096  INT l[ths->d]; /* multi index u<=l<=o */
1097  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1098  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1099  R phi_prod[ths->d+1]; /* postfix product of PHI */
1100  R y[ths->d];
1101  R fg_psi[ths->d][2*ths->m+2];
1102  INT l_fg,lj_fg;
1103  R ip_w;
1104  INT ip_u;
1105  INT ip_s = ths->K/(ths->m+2);
1106  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1107 
1108  phi_prod[0] = K(1.0);
1109  ll_plain[0] = 0;
1110 
1111  MACRO_init_uo_l_lj_t;
1112 
1113  for (t2 = 0; t2 < ths->d; t2++)
1114  {
1115  y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1116  * ((R)ths->K))/(ths->m+2);
1117  ip_u = LRINT(FLOOR(y[t2]));
1118  ip_w = y[t2]-ip_u;
1119  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1120  {
1121  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1122  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1123  * (ip_w);
1124  }
1125  }
1126 
1127  for (l_L = 0; l_L < lprod; l_L++)
1128  {
1129  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1130 
1131  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1132 
1133  MACRO_count_uo_l_lj_t;
1134  } /* for(l_L) */
1135  } /* for(j) */
1136  return;
1137  } /* if(PRE_LIN_PSI) */
1138 
1139  /* no precomputed psi at all */
1140  sort(ths);
1141 
1142  #pragma omp parallel for default(shared) private(k)
1143  for (k = 0; k < ths->M_total; k++)
1144  {
1145  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1146  INT t, t2; /* index dimensions */
1147  INT l_L; /* index one row of B */
1148  INT l[ths->d]; /* multi index u<=l<=o */
1149  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1150  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1151  R phi_prod[ths->d+1]; /* postfix product of PHI */
1152  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1153 
1154  phi_prod[0] = K(1.0);
1155  ll_plain[0] = 0;
1156 
1157  MACRO_init_uo_l_lj_t;
1158 
1159  for (l_L = 0; l_L < lprod; l_L++)
1160  {
1161  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1162 
1163  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1164 
1165  MACRO_count_uo_l_lj_t;
1166  } /* for(l_L) */
1167  } /* for(j) */
1168 }
1169 #endif
1170 
1171 static void B_A(X(plan) *ths)
1172 {
1173 #ifdef _OPENMP
1174  B_openmp_A(ths);
1175 #else
1176  B_serial_A(ths);
1177 #endif
1178 }
1179 
1180 #ifdef _OPENMP
1181 
1196 static inline INT index_x_binary_search(const INT *ar_x, const INT len, const INT key)
1197 {
1198  INT left = 0, right = len - 1;
1199 
1200  if (len == 1)
1201  return 0;
1202 
1203  while (left < right - 1)
1204  {
1205  INT i = (left + right) / 2;
1206  if (ar_x[2*i] >= key)
1207  right = i;
1208  else if (ar_x[2*i] < key)
1209  left = i;
1210  }
1211 
1212  if (ar_x[2*left] < key && left != len-1)
1213  return left+1;
1214 
1215  return left;
1216 }
1217 #endif
1218 
1219 #ifdef _OPENMP
1220 
1235 static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0,
1236  INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b, const INT d,
1237  const INT *n, const INT m)
1238 {
1239  const INT n0 = n[0];
1240  INT k;
1241  INT nthreads = omp_get_num_threads();
1242  INT nthreads_used = MIN(nthreads, n0);
1243  INT size_per_thread = n0 / nthreads_used;
1244  INT size_left = n0 - size_per_thread * nthreads_used;
1245  INT size_g[nthreads_used];
1246  INT offset_g[nthreads_used];
1247  INT my_id = omp_get_thread_num();
1248  INT n_prod_rest = 1;
1249 
1250  for (k = 1; k < d; k++)
1251  n_prod_rest *= n[k];
1252 
1253  *min_u_a = -1;
1254  *max_u_a = -1;
1255  *min_u_b = -1;
1256  *max_u_b = -1;
1257  *my_u0 = -1;
1258  *my_o0 = -1;
1259 
1260  if (my_id < nthreads_used)
1261  {
1262  const INT m22 = 2 * m + 2;
1263 
1264  offset_g[0] = 0;
1265  for (k = 0; k < nthreads_used; k++)
1266  {
1267  if (k > 0)
1268  offset_g[k] = offset_g[k-1] + size_g[k-1];
1269  size_g[k] = size_per_thread;
1270  if (size_left > 0)
1271  {
1272  size_g[k]++;
1273  size_left--;
1274  }
1275  }
1276 
1277  *my_u0 = offset_g[my_id];
1278  *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1279 
1280  if (nthreads_used > 1)
1281  {
1282  *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1283  *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1284  }
1285  else
1286  {
1287  *min_u_a = 0;
1288  *max_u_a = n_prod_rest * n0 - 1;
1289  }
1290 
1291  if (*min_u_a < 0)
1292  {
1293  *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1294  *max_u_b = n_prod_rest * n0 - 1;
1295  *min_u_a = 0;
1296  }
1297 
1298  if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1299  {
1300  *max_u_a = *max_u_b;
1301  *min_u_b = -1;
1302  *max_u_b = -1;
1303  }
1304 #ifdef OMP_ASSERT
1305  assert(*min_u_a <= *max_u_a);
1306  assert(*min_u_b <= *max_u_b);
1307  assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1308 #endif
1309  }
1310 }
1311 #endif
1312 
1321 static void nfft_adjoint_B_compute_full_psi(C *g, const INT *psi_index_g,
1322  const R *psi, const C *f, const INT M, const INT d, const INT *n,
1323  const INT m, const unsigned flags, const INT *index_x)
1324 {
1325  INT k;
1326  INT lprod;
1327 #ifdef _OPENMP
1328  INT lprod_m1;
1329 #endif
1330 #ifndef _OPENMP
1331  UNUSED(n);
1332 #endif
1333  {
1334  INT t;
1335  for(t = 0, lprod = 1; t < d; t++)
1336  lprod *= 2 * m + 2;
1337  }
1338 #ifdef _OPENMP
1339  lprod_m1 = lprod / (2 * m + 2);
1340 #endif
1341 
1342 #ifdef _OPENMP
1343  if (flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1344  {
1345  #pragma omp parallel private(k)
1346  {
1347  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1348  const INT *ar_x = index_x;
1349  INT n_prod_rest = 1;
1350 
1351  for (k = 1; k < d; k++)
1352  n_prod_rest *= n[k];
1353 
1354  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1355 
1356  if (min_u_a != -1)
1357  {
1358  k = index_x_binary_search(ar_x, M, min_u_a);
1359 #ifdef OMP_ASSERT
1360  assert(ar_x[2*k] >= min_u_a || k == M-1);
1361  if (k > 0)
1362  assert(ar_x[2*k-2] < min_u_a);
1363 #endif
1364  while (k < M)
1365  {
1366  INT l0, lrest;
1367  INT u_prod = ar_x[2*k];
1368  INT j = ar_x[2*k+1];
1369 
1370  if (u_prod < min_u_a || u_prod > max_u_a)
1371  break;
1372 
1373  for (l0 = 0; l0 < 2 * m + 2; l0++)
1374  {
1375  const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1376 
1377  if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1378  continue;
1379 
1380  for (lrest = 0; lrest < lprod_m1; lrest++)
1381  {
1382  const INT l = l0 * lprod_m1 + lrest;
1383  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1384  }
1385  }
1386 
1387  k++;
1388  }
1389  }
1390 
1391  if (min_u_b != -1)
1392  {
1393  k = index_x_binary_search(ar_x, M, min_u_b);
1394 #ifdef OMP_ASSERT
1395  assert(ar_x[2*k] >= min_u_b || k == M-1);
1396  if (k > 0)
1397  assert(ar_x[2*k-2] < min_u_b);
1398 #endif
1399  while (k < M)
1400  {
1401  INT l0, lrest;
1402  INT u_prod = ar_x[2*k];
1403  INT j = ar_x[2*k+1];
1404 
1405  if (u_prod < min_u_b || u_prod > max_u_b)
1406  break;
1407 
1408  for (l0 = 0; l0 < 2 * m + 2; l0++)
1409  {
1410  const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1411 
1412  if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1413  continue;
1414  for (lrest = 0; lrest < lprod_m1; lrest++)
1415  {
1416  const INT l = l0 * lprod_m1 + lrest;
1417  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1418  }
1419  }
1420 
1421  k++;
1422  }
1423  }
1424  } /* omp parallel */
1425  return;
1426  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */
1427 #endif
1428 
1429 #ifdef _OPENMP
1430  #pragma omp parallel for default(shared) private(k)
1431 #endif
1432  for (k = 0; k < M; k++)
1433  {
1434  INT l;
1435  INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1436 
1437  for (l = 0; l < lprod; l++)
1438  {
1439 #ifdef _OPENMP
1440  C val = psi[j * lprod + l] * f[j];
1441  C *gref = g + psi_index_g[j * lprod + l];
1442  R *gref_real = (R*) gref;
1443 
1444  #pragma omp atomic
1445  gref_real[0] += creal(val);
1446 
1447  #pragma omp atomic
1448  gref_real[1] += cimag(val);
1449 #else
1450  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1451 #endif
1452  }
1453  }
1454 }
1455 
1456 #ifndef _OPENMP
1457 MACRO_B(T)
1458 #endif
1459 
1460 #ifdef _OPENMP
1461 static inline void B_openmp_T(X(plan) *ths)
1462 {
1463  INT lprod; /* 'regular bandwidth' of matrix B */
1464  INT k;
1465 
1466  memset(ths->g, 0, (size_t)(ths->n_total) * sizeof(C));
1467 
1468  for (k = 0, lprod = 1; k < ths->d; k++)
1469  lprod *= (2*ths->m+2);
1470 
1471  if (ths->flags & PRE_FULL_PSI)
1472  {
1473  nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f,
1474  ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x);
1475  return;
1476  }
1477 
1478  if (ths->flags & PRE_PSI)
1479  {
1480  #pragma omp parallel for default(shared) private(k)
1481  for (k = 0; k < ths->M_total; k++)
1482  {
1483  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1484  INT t, t2; /* index dimensions */
1485  INT l_L; /* index one row of B */
1486  INT l[ths->d]; /* multi index u<=l<=o */
1487  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1488  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1489  R phi_prod[ths->d+1]; /* postfix product of PHI */
1490  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1491 
1492  phi_prod[0] = K(1.0);
1493  ll_plain[0] = 0;
1494 
1495  MACRO_init_uo_l_lj_t;
1496 
1497  for (l_L = 0; l_L < lprod; l_L++)
1498  {
1499  C *lhs;
1500  R *lhs_real;
1501  C val;
1502 
1503  MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1504 
1505  lhs = ths->g + ll_plain[ths->d];
1506  lhs_real = (R*)lhs;
1507  val = phi_prod[ths->d] * ths->f[j];
1508 
1509  #pragma omp atomic
1510  lhs_real[0] += CREAL(val);
1511 
1512  #pragma omp atomic
1513  lhs_real[1] += CIMAG(val);
1514 
1515  MACRO_count_uo_l_lj_t;
1516  } /* for(l_L) */
1517  } /* for(j) */
1518  return;
1519  } /* if(PRE_PSI) */
1520 
1521  if (ths->flags & PRE_FG_PSI)
1522  {
1523  INT t, t2; /* index dimensions */
1524  R fg_exp_l[ths->d][2*ths->m+2];
1525  for(t2 = 0; t2 < ths->d; t2++)
1526  {
1527  INT lj_fg;
1528  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1529  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1530  R tmp2 = K(1.0);
1531  R tmp3 = K(1.0);
1532  fg_exp_l[t2][0] = K(1.0);
1533  for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1534  {
1535  tmp3 = tmp2*tmpEXP2;
1536  tmp2 *= tmpEXP2sq;
1537  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1538  }
1539  }
1540 
1541  #pragma omp parallel for default(shared) private(k,t,t2)
1542  for (k = 0; k < ths->M_total; k++)
1543  {
1544  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1545  R phi_prod[ths->d+1]; /* postfix product of PHI */
1546  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1547  INT l[ths->d]; /* multi index u<=l<=o */
1548  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1549  R fg_psi[ths->d][2*ths->m+2];
1550  R tmpEXP1, tmp1;
1551  INT l_fg,lj_fg;
1552  INT l_L;
1553  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1554 
1555  phi_prod[0] = K(1.0);
1556  ll_plain[0] = 0;
1557 
1558  MACRO_init_uo_l_lj_t;
1559 
1560  for (t2 = 0; t2 < ths->d; t2++)
1561  {
1562  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
1563  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
1564  tmp1 = K(1.0);
1565  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1566  {
1567  tmp1 *= tmpEXP1;
1568  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1569  }
1570  }
1571 
1572  for (l_L= 0; l_L < lprod; l_L++)
1573  {
1574  C *lhs;
1575  R *lhs_real;
1576  C val;
1577 
1578  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1579 
1580  lhs = ths->g + ll_plain[ths->d];
1581  lhs_real = (R*)lhs;
1582  val = phi_prod[ths->d] * ths->f[j];
1583 
1584  #pragma omp atomic
1585  lhs_real[0] += CREAL(val);
1586 
1587  #pragma omp atomic
1588  lhs_real[1] += CIMAG(val);
1589 
1590  MACRO_count_uo_l_lj_t;
1591  } /* for(l_L) */
1592  } /* for(j) */
1593  return;
1594  } /* if(PRE_FG_PSI) */
1595 
1596  if (ths->flags & FG_PSI)
1597  {
1598  INT t, t2; /* index dimensions */
1599  R fg_exp_l[ths->d][2*ths->m+2];
1600 
1601  sort(ths);
1602 
1603  for (t2 = 0; t2 < ths->d; t2++)
1604  {
1605  INT lj_fg;
1606  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1607  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1608  R tmp2 = K(1.0);
1609  R tmp3 = K(1.0);
1610  fg_exp_l[t2][0] = K(1.0);
1611  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1612  {
1613  tmp3 = tmp2*tmpEXP2;
1614  tmp2 *= tmpEXP2sq;
1615  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1616  }
1617  }
1618 
1619  #pragma omp parallel for default(shared) private(k,t,t2)
1620  for (k = 0; k < ths->M_total; k++)
1621  {
1622  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1623  R phi_prod[ths->d+1]; /* postfix product of PHI */
1624  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1625  INT l[ths->d]; /* multi index u<=l<=o */
1626  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1627  R fg_psi[ths->d][2*ths->m+2];
1628  R tmpEXP1, tmp1;
1629  INT l_fg,lj_fg;
1630  INT l_L;
1631  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1632 
1633  phi_prod[0] = K(1.0);
1634  ll_plain[0] = 0;
1635 
1636  MACRO_init_uo_l_lj_t;
1637 
1638  for (t2 = 0; t2 < ths->d; t2++)
1639  {
1640  fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1641 
1642  tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1643  /ths->b[t2]);
1644  tmp1 = K(1.0);
1645  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1646  {
1647  tmp1 *= tmpEXP1;
1648  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1649  }
1650  }
1651 
1652  for (l_L = 0; l_L < lprod; l_L++)
1653  {
1654  C *lhs;
1655  R *lhs_real;
1656  C val;
1657 
1658  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1659 
1660  lhs = ths->g + ll_plain[ths->d];
1661  lhs_real = (R*)lhs;
1662  val = phi_prod[ths->d] * ths->f[j];
1663 
1664  #pragma omp atomic
1665  lhs_real[0] += CREAL(val);
1666 
1667  #pragma omp atomic
1668  lhs_real[1] += CIMAG(val);
1669 
1670  MACRO_count_uo_l_lj_t;
1671  } /* for(l_L) */
1672  } /* for(j) */
1673  return;
1674  } /* if(FG_PSI) */
1675 
1676  if (ths->flags & PRE_LIN_PSI)
1677  {
1678  sort(ths);
1679 
1680  #pragma omp parallel for default(shared) private(k)
1681  for (k = 0; k<ths->M_total; k++)
1682  {
1683  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1684  INT t, t2; /* index dimensions */
1685  INT l_L; /* index one row of B */
1686  INT l[ths->d]; /* multi index u<=l<=o */
1687  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1688  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1689  R phi_prod[ths->d+1]; /* postfix product of PHI */
1690  R y[ths->d];
1691  R fg_psi[ths->d][2*ths->m+2];
1692  INT l_fg,lj_fg;
1693  R ip_w;
1694  INT ip_u;
1695  INT ip_s = ths->K/(ths->m+2);
1696  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1697 
1698  phi_prod[0] = K(1.0);
1699  ll_plain[0] = 0;
1700 
1701  MACRO_init_uo_l_lj_t;
1702 
1703  for (t2 = 0; t2 < ths->d; t2++)
1704  {
1705  y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1706  * ((R)ths->K))/(ths->m+2);
1707  ip_u = LRINT(FLOOR(y[t2]));
1708  ip_w = y[t2]-ip_u;
1709  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1710  {
1711  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1712  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1713  * (ip_w);
1714  }
1715  }
1716 
1717  for (l_L = 0; l_L < lprod; l_L++)
1718  {
1719  C *lhs;
1720  R *lhs_real;
1721  C val;
1722 
1723  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1724 
1725  lhs = ths->g + ll_plain[ths->d];
1726  lhs_real = (R*)lhs;
1727  val = phi_prod[ths->d] * ths->f[j];
1728 
1729  #pragma omp atomic
1730  lhs_real[0] += CREAL(val);
1731 
1732  #pragma omp atomic
1733  lhs_real[1] += CIMAG(val);
1734 
1735  MACRO_count_uo_l_lj_t;
1736  } /* for(l_L) */
1737  } /* for(j) */
1738  return;
1739  } /* if(PRE_LIN_PSI) */
1740 
1741  /* no precomputed psi at all */
1742  sort(ths);
1743 
1744  #pragma omp parallel for default(shared) private(k)
1745  for (k = 0; k < ths->M_total; k++)
1746  {
1747  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1748  INT t, t2; /* index dimensions */
1749  INT l_L; /* index one row of B */
1750  INT l[ths->d]; /* multi index u<=l<=o */
1751  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1752  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1753  R phi_prod[ths->d+1]; /* postfix product of PHI */
1754  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1755 
1756  phi_prod[0] = K(1.0);
1757  ll_plain[0] = 0;
1758 
1759  MACRO_init_uo_l_lj_t;
1760 
1761  for (l_L = 0; l_L < lprod; l_L++)
1762  {
1763  C *lhs;
1764  R *lhs_real;
1765  C val;
1766 
1767  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1768 
1769  lhs = ths->g + ll_plain[ths->d];
1770  lhs_real = (R*)lhs;
1771  val = phi_prod[ths->d] * ths->f[j];
1772 
1773  #pragma omp atomic
1774  lhs_real[0] += CREAL(val);
1775 
1776  #pragma omp atomic
1777  lhs_real[1] += CIMAG(val);
1778 
1779  MACRO_count_uo_l_lj_t;
1780  } /* for(l_L) */
1781  } /* for(j) */
1782 }
1783 #endif
1784 
1785 static void B_T(X(plan) *ths)
1786 {
1787 #ifdef _OPENMP
1788  B_openmp_T(ths);
1789 #else
1790  B_serial_T(ths);
1791 #endif
1792 }
1793 
1794 /* ## specialized version for d=1 ########################################### */
1795 
1796 static void nfft_1d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
1797 {
1798  const INT tmp2 = 2*m+2;
1799  INT l;
1800  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
1801 
1802  fg_exp_b0 = EXP(K(-1.0)/b);
1803  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
1804  fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
1805 
1806  for (l = 1; l < tmp2; l++)
1807  {
1808  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
1809  fg_exp_b1 *= fg_exp_b0_sq;
1810  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
1811  }
1812 }
1813 
1814 
1815 static void nfft_trafo_1d_compute(C *fj, const C *g,const R *psij_const,
1816  const R *xj, const INT n, const INT m)
1817 {
1818  INT u, o, l;
1819  const C *gj;
1820  const R *psij;
1821  psij = psij_const;
1822 
1823  uo2(&u, &o, *xj, n, m);
1824 
1825  if (u < o)
1826  {
1827  for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
1828  (*fj) += (*psij++) * (*gj++);
1829  }
1830  else
1831  {
1832  for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
1833  (*fj) += (*psij++) * (*gj++);
1834  for (l = 0, gj = g; l <= o; l++)
1835  (*fj) += (*psij++) * (*gj++);
1836  }
1837 }
1838 
1839 #ifndef _OPENMP
1840 static void nfft_adjoint_1d_compute_serial(const C *fj, C *g,
1841  const R *psij_const, const R *xj, const INT n, const INT m)
1842 {
1843  INT u,o,l;
1844  C *gj;
1845  const R *psij;
1846  psij = psij_const;
1847 
1848  uo2(&u,&o,*xj, n, m);
1849 
1850  if (u < o)
1851  {
1852  for (l = 0, gj = g+u; l <= 2*m+1; l++)
1853  (*gj++) += (*psij++) * (*fj);
1854  }
1855  else
1856  {
1857  for (l = 0, gj = g+u; l < 2*m+1-o; l++)
1858  (*gj++) += (*psij++) * (*fj);
1859  for (l = 0, gj = g; l <= o; l++)
1860  (*gj++) += (*psij++) * (*fj);
1861  }
1862 }
1863 #endif
1864 
1865 #ifdef _OPENMP
1866 /* adjoint NFFT one-dimensional case with OpenMP atomic operations */
1867 static void nfft_adjoint_1d_compute_omp_atomic(const C f, C *g,
1868  const R *psij_const, const R *xj, const INT n, const INT m)
1869 {
1870  INT u,o,l;
1871  C *gj;
1872  INT index_temp[2*m+2];
1873 
1874  uo2(&u,&o,*xj, n, m);
1875 
1876  for (l=0; l<=2*m+1; l++)
1877  index_temp[l] = (l+u)%n;
1878 
1879  for (l = 0, gj = g+u; l <= 2*m+1; l++)
1880  {
1881  INT i = index_temp[l];
1882  C *lhs = g+i;
1883  R *lhs_real = (R*)lhs;
1884  C val = psij_const[l] * f;
1885  #pragma omp atomic
1886  lhs_real[0] += creal(val);
1887 
1888  #pragma omp atomic
1889  lhs_real[1] += cimag(val);
1890  }
1891 }
1892 #endif
1893 
1894 #ifdef _OPENMP
1895 
1910 static void nfft_adjoint_1d_compute_omp_blockwise(const C f, C *g,
1911  const R *psij_const, const R *xj, const INT n, const INT m,
1912  const INT my_u0, const INT my_o0)
1913 {
1914  INT ar_u,ar_o,l;
1915 
1916  uo2(&ar_u,&ar_o,*xj, n, m);
1917 
1918  if (ar_u < ar_o)
1919  {
1920  INT u = MAX(my_u0,ar_u);
1921  INT o = MIN(my_o0,ar_o);
1922  INT offset_psij = u-ar_u;
1923 #ifdef OMP_ASSERT
1924  assert(offset_psij >= 0);
1925  assert(o-u <= 2*m+1);
1926  assert(offset_psij+o-u <= 2*m+1);
1927 #endif
1928 
1929  for (l = 0; l <= o-u; l++)
1930  g[u+l] += psij_const[offset_psij+l] * f;
1931  }
1932  else
1933  {
1934  INT u = MAX(my_u0,ar_u);
1935  INT o = my_o0;
1936  INT offset_psij = u-ar_u;
1937 #ifdef OMP_ASSERT
1938  assert(offset_psij >= 0);
1939  assert(o-u <= 2*m+1);
1940  assert(offset_psij+o-u <= 2*m+1);
1941 #endif
1942 
1943  for (l = 0; l <= o-u; l++)
1944  g[u+l] += psij_const[offset_psij+l] * f;
1945 
1946  u = my_u0;
1947  o = MIN(my_o0,ar_o);
1948  offset_psij += my_u0-ar_u+n;
1949 
1950 #ifdef OMP_ASSERT
1951  if (u <= o)
1952  {
1953  assert(o-u <= 2*m+1);
1954  if (offset_psij+o-u > 2*m+1)
1955  {
1956  fprintf(stderr, "ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
1957  }
1958  assert(offset_psij+o-u <= 2*m+1);
1959  }
1960 #endif
1961  for (l = 0; l <= o-u; l++)
1962  g[u+l] += psij_const[offset_psij+l] * f;
1963  }
1964 }
1965 #endif
1966 
1967 static void nfft_trafo_1d_B(X(plan) *ths)
1968 {
1969  const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
1970  const C *g = (C*)ths->g;
1971 
1972  if (ths->flags & PRE_FULL_PSI)
1973  {
1974  INT k;
1975 #ifdef _OPENMP
1976  #pragma omp parallel for default(shared) private(k)
1977 #endif
1978  for (k = 0; k < M; k++)
1979  {
1980  INT l;
1981  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1982  ths->f[j] = K(0.0);
1983  for (l = 0; l < m2p2; l++)
1984  ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
1985  }
1986  return;
1987  } /* if(PRE_FULL_PSI) */
1988 
1989  if (ths->flags & PRE_PSI)
1990  {
1991  INT k;
1992 #ifdef _OPENMP
1993  #pragma omp parallel for default(shared) private(k)
1994 #endif
1995  for (k = 0; k < M; k++)
1996  {
1997  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1998  nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
1999  &ths->x[j], n, m);
2000  }
2001  return;
2002  } /* if(PRE_PSI) */
2003 
2004  if (ths->flags & PRE_FG_PSI)
2005  {
2006  INT k;
2007  R fg_exp_l[m2p2];
2008 
2009  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2010 
2011 #ifdef _OPENMP
2012  #pragma omp parallel for default(shared) private(k)
2013 #endif
2014  for (k = 0; k < M; k++)
2015  {
2016  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2017  const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
2018  R fg_psij2 = K(1.0);
2019  R psij_const[m2p2];
2020  INT l;
2021 
2022  psij_const[0] = fg_psij0;
2023 
2024  for (l = 1; l < m2p2; l++)
2025  {
2026  fg_psij2 *= fg_psij1;
2027  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2028  }
2029 
2030  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2031  }
2032 
2033  return;
2034  } /* if(PRE_FG_PSI) */
2035 
2036  if (ths->flags & FG_PSI)
2037  {
2038  INT k;
2039  R fg_exp_l[m2p2];
2040 
2041  sort(ths);
2042 
2043  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2044 
2045 #ifdef _OPENMP
2046  #pragma omp parallel for default(shared) private(k)
2047 #endif
2048  for (k = 0; k < M; k++)
2049  {
2050  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2051  INT u, o, l;
2052  R fg_psij0, fg_psij1, fg_psij2;
2053  R psij_const[m2p2];
2054 
2055  uo(ths, (INT)j, &u, &o, (INT)0);
2056  fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0));
2057  fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]);
2058  fg_psij2 = K(1.0);
2059 
2060  psij_const[0] = fg_psij0;
2061 
2062  for (l = 1; l < m2p2; l++)
2063  {
2064  fg_psij2 *= fg_psij1;
2065  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2066  }
2067 
2068  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2069  }
2070  return;
2071  } /* if(FG_PSI) */
2072 
2073  if (ths->flags & PRE_LIN_PSI)
2074  {
2075  const INT K = ths->K, ip_s = K / (m + 2);
2076  INT k;
2077 
2078  sort(ths);
2079 
2080 #ifdef _OPENMP
2081  #pragma omp parallel for default(shared) private(k)
2082 #endif
2083  for (k = 0; k < M; k++)
2084  {
2085  INT u, o, l;
2086  R ip_y, ip_w;
2087  INT ip_u;
2088  R psij_const[m2p2];
2089  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2090 
2091  uo(ths, (INT)j, &u, &o, (INT)0);
2092 
2093  ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2094  ip_u = (INT)(LRINT(FLOOR(ip_y)));
2095  ip_w = ip_y - (R)(ip_u);
2096 
2097  for (l = 0; l < m2p2; l++)
2098  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2099  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2100 
2101  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2102  }
2103  return;
2104  } /* if(PRE_LIN_PSI) */
2105  else
2106  {
2107  /* no precomputed psi at all */
2108  INT k;
2109 
2110  sort(ths);
2111 
2112 #ifdef _OPENMP
2113  #pragma omp parallel for default(shared) private(k)
2114 #endif
2115  for (k = 0; k < M; k++)
2116  {
2117  R psij_const[m2p2];
2118  INT u, o, l;
2119  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2120 
2121  uo(ths, (INT)j, &u, &o, (INT)0);
2122 
2123  for (l = 0; l < m2p2; l++)
2124  psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0));
2125 
2126  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2127  }
2128  }
2129 }
2130 
2131 
2132 #ifdef OMP_ASSERT
2133 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2134 { \
2135  assert(ar_x[2*k] >= min_u_a || k == M-1); \
2136  if (k > 0) \
2137  assert(ar_x[2*k-2] < min_u_a); \
2138 }
2139 #else
2140 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
2141 #endif
2142 
2143 #ifdef OMP_ASSERT
2144 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2145 { \
2146  assert(ar_x[2*k] >= min_u_b || k == M-1); \
2147  if (k > 0) \
2148  assert(ar_x[2*k-2] < min_u_b); \
2149 }
2150 #else
2151 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
2152 #endif
2153 
2154 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
2155 { \
2156  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
2157  ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
2158 }
2159 
2160 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
2161 { \
2162  R psij_const[2 * m + 2]; \
2163  INT u, o, l; \
2164  R fg_psij0 = ths->psi[2 * j]; \
2165  R fg_psij1 = ths->psi[2 * j + 1]; \
2166  R fg_psij2 = K(1.0); \
2167  \
2168  psij_const[0] = fg_psij0; \
2169  for (l = 1; l <= 2 * m + 1; l++) \
2170  { \
2171  fg_psij2 *= fg_psij1; \
2172  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2173  } \
2174  \
2175  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2176  ths->x + j, n, m, my_u0, my_o0); \
2177 }
2178 
2179 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
2180 { \
2181  R psij_const[2 * m + 2]; \
2182  R fg_psij0, fg_psij1, fg_psij2; \
2183  INT u, o, l; \
2184  \
2185  uo(ths, j, &u, &o, (INT)0); \
2186  fg_psij0 = (PHI(ths->n[0],ths->x[j]-((R)u)/n,0)); \
2187  fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \
2188  fg_psij2 = K(1.0); \
2189  psij_const[0] = fg_psij0; \
2190  for (l = 1; l <= 2 * m + 1; l++) \
2191  { \
2192  fg_psij2 *= fg_psij1; \
2193  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2194  } \
2195  \
2196  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2197  ths->x + j, n, m, my_u0, my_o0); \
2198 }
2199 
2200 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
2201 { \
2202  R psij_const[2 * m + 2]; \
2203  INT ip_u; \
2204  R ip_y, ip_w; \
2205  INT u, o, l; \
2206  \
2207  uo(ths, j, &u, &o, (INT)0); \
2208  \
2209  ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \
2210  ip_u = LRINT(FLOOR(ip_y)); \
2211  ip_w = ip_y - ip_u; \
2212  for (l = 0; l < 2 * m + 2; l++) \
2213  psij_const[l] \
2214  = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
2215  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
2216  \
2217  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2218  ths->x + j, n, m, my_u0, my_o0); \
2219 }
2220 
2221 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
2222 { \
2223  R psij_const[2 * m + 2]; \
2224  INT u, o, l; \
2225  \
2226  uo(ths, j, &u, &o, (INT)0); \
2227  \
2228  for (l = 0; l <= 2 * m + 1; l++) \
2229  psij_const[l] = (PHI(ths->n[0],ths->x[j]-((R)((u+l)))/n,0)); \
2230  \
2231  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2232  ths->x + j, n, m, my_u0, my_o0); \
2233 }
2234 
2235 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
2236 { \
2237  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
2238  { \
2239  _Pragma("omp parallel private(k)") \
2240  { \
2241  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
2242  INT *ar_x = ths->index_x; \
2243  \
2244  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
2245  &min_u_b, &max_u_b, 1, &n, m); \
2246  \
2247  if (min_u_a != -1) \
2248  { \
2249  k = index_x_binary_search(ar_x, M, min_u_a); \
2250  \
2251  MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2252  \
2253  while (k < M) \
2254  { \
2255  INT u_prod = ar_x[2*k]; \
2256  INT j = ar_x[2*k+1]; \
2257  \
2258  if (u_prod < min_u_a || u_prod > max_u_a) \
2259  break; \
2260  \
2261  MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2262  \
2263  k++; \
2264  } \
2265  } \
2266  \
2267  if (min_u_b != -1) \
2268  { \
2269  k = index_x_binary_search(ar_x, M, min_u_b); \
2270  \
2271  MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2272  \
2273  while (k < M) \
2274  { \
2275  INT u_prod = ar_x[2*k]; \
2276  INT j = ar_x[2*k+1]; \
2277  \
2278  if (u_prod < min_u_b || u_prod > max_u_b) \
2279  break; \
2280  \
2281  MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2282  \
2283  k++; \
2284  } \
2285  } \
2286  } /* omp parallel */ \
2287  return; \
2288  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
2289 }
2290 
2291 static void nfft_adjoint_1d_B(X(plan) *ths)
2292 {
2293  const INT n = ths->n[0], M = ths->M_total, m = ths->m;
2294  INT k;
2295  C *g = (C*)ths->g;
2296 
2297  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
2298 
2299  if (ths->flags & PRE_FULL_PSI)
2300  {
2301  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
2302  (INT)1, ths->n, m, ths->flags, ths->index_x);
2303  return;
2304  } /* if(PRE_FULL_PSI) */
2305 
2306  if (ths->flags & PRE_PSI)
2307  {
2308 #ifdef _OPENMP
2309  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
2310 #endif
2311 
2312 #ifdef _OPENMP
2313  #pragma omp parallel for default(shared) private(k)
2314 #endif
2315  for (k = 0; k < M; k++)
2316  {
2317  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2318 #ifdef _OPENMP
2319  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2320 #else
2321  nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2322 #endif
2323  }
2324 
2325  return;
2326  } /* if(PRE_PSI) */
2327 
2328  if (ths->flags & PRE_FG_PSI)
2329  {
2330  R fg_exp_l[2 * m + 2];
2331 
2332  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2333 
2334 #ifdef _OPENMP
2335  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
2336 #endif
2337 
2338 
2339 #ifdef _OPENMP
2340  #pragma omp parallel for default(shared) private(k)
2341 #endif
2342  for (k = 0; k < M; k++)
2343  {
2344  R psij_const[2 * m + 2];
2345  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2346  INT l;
2347  R fg_psij0 = ths->psi[2 * j];
2348  R fg_psij1 = ths->psi[2 * j + 1];
2349  R fg_psij2 = K(1.0);
2350 
2351  psij_const[0] = fg_psij0;
2352  for (l = 1; l <= 2 * m + 1; l++)
2353  {
2354  fg_psij2 *= fg_psij1;
2355  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2356  }
2357 
2358 #ifdef _OPENMP
2359  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2360 #else
2361  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2362 #endif
2363  }
2364 
2365  return;
2366  } /* if(PRE_FG_PSI) */
2367 
2368  if (ths->flags & FG_PSI)
2369  {
2370  R fg_exp_l[2 * m + 2];
2371 
2372  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2373 
2374  sort(ths);
2375 
2376 #ifdef _OPENMP
2377  MACRO_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
2378 #endif
2379 
2380 #ifdef _OPENMP
2381  #pragma omp parallel for default(shared) private(k)
2382 #endif
2383  for (k = 0; k < M; k++)
2384  {
2385  INT u,o,l;
2386  R psij_const[2 * m + 2];
2387  R fg_psij0, fg_psij1, fg_psij2;
2388  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2389 
2390  uo(ths, j, &u, &o, (INT)0);
2391  fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0));
2392  fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]);
2393  fg_psij2 = K(1.0);
2394  psij_const[0] = fg_psij0;
2395  for (l = 1; l <= 2 * m + 1; l++)
2396  {
2397  fg_psij2 *= fg_psij1;
2398  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2399  }
2400 
2401 #ifdef _OPENMP
2402  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2403 #else
2404  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2405 #endif
2406  }
2407 
2408  return;
2409  } /* if(FG_PSI) */
2410 
2411  if (ths->flags & PRE_LIN_PSI)
2412  {
2413  const INT K = ths->K;
2414  const INT ip_s = K / (m + 2);
2415 
2416  sort(ths);
2417 
2418 #ifdef _OPENMP
2419  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
2420 #endif
2421 
2422 #ifdef _OPENMP
2423  #pragma omp parallel for default(shared) private(k)
2424 #endif
2425  for (k = 0; k < M; k++)
2426  {
2427  INT u,o,l;
2428  INT ip_u;
2429  R ip_y, ip_w;
2430  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2431  R psij_const[2 * m + 2];
2432 
2433  uo(ths, j, &u, &o, (INT)0);
2434 
2435  ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2436  ip_u = (INT)(LRINT(FLOOR(ip_y)));
2437  ip_w = ip_y - (R)(ip_u);
2438  for (l = 0; l < 2 * m + 2; l++)
2439  psij_const[l]
2440  = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2441  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2442 
2443 #ifdef _OPENMP
2444  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2445 #else
2446  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2447 #endif
2448  }
2449  return;
2450  } /* if(PRE_LIN_PSI) */
2451 
2452  /* no precomputed psi at all */
2453  sort(ths);
2454 
2455 #ifdef _OPENMP
2456  MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2457 #endif
2458 
2459 #ifdef _OPENMP
2460  #pragma omp parallel for default(shared) private(k)
2461 #endif
2462  for (k = 0; k < M; k++)
2463  {
2464  INT u,o,l;
2465  R psij_const[2 * m + 2];
2466  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2467 
2468  uo(ths, j, &u, &o, (INT)0);
2469 
2470  for (l = 0; l <= 2 * m + 1; l++)
2471  psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0));
2472 
2473 #ifdef _OPENMP
2474  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2475 #else
2476  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2477 #endif
2478  }
2479 }
2480 
2481 void X(trafo_1d)(X(plan) *ths)
2482 {
2483  const INT N = ths->N[0], N2 = N/2, n = ths->n[0];
2484  C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
2485 
2486  ths->g_hat = ths->g1;
2487  ths->g = ths->g2;
2488 
2489  {
2490  C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
2491  R *c_phi_inv1, *c_phi_inv2;
2492 
2493  TIC(0)
2494 #ifdef _OPENMP
2495  {
2496  INT k;
2497  #pragma omp parallel for default(shared) private(k)
2498  for (k = 0; k < ths->n_total; k++)
2499  ths->g_hat[k] = 0.0;
2500  }
2501 #else
2502  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
2503 #endif
2504  if(ths->flags & PRE_PHI_HUT)
2505  {
2506  INT k;
2507  c_phi_inv1 = ths->c_phi_inv[0];
2508  c_phi_inv2 = &ths->c_phi_inv[0][N2];
2509 
2510 #ifdef _OPENMP
2511  #pragma omp parallel for default(shared) private(k)
2512 #endif
2513  for (k = 0; k < N2; k++)
2514  {
2515  g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2516  g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2517  }
2518  }
2519  else
2520  {
2521  INT k;
2522 #ifdef _OPENMP
2523  #pragma omp parallel for default(shared) private(k)
2524 #endif
2525  for (k = 0; k < N2; k++)
2526  {
2527  g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0));
2528  g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2529  }
2530  }
2531  TOC(0)
2532 
2533  TIC_FFTW(1)
2534  FFTW(execute)(ths->my_fftw_plan1);
2535  TOC_FFTW(1);
2536 
2537  TIC(2);
2538  nfft_trafo_1d_B(ths);
2539  TOC(2);
2540  }
2541 }
2542 
2543 void X(adjoint_1d)(X(plan) *ths)
2544 {
2545  INT n,N;
2546  C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2547  R *c_phi_inv1, *c_phi_inv2;
2548 
2549  N=ths->N[0];
2550  n=ths->n[0];
2551 
2552  ths->g_hat=ths->g1;
2553  ths->g=ths->g2;
2554 
2555  f_hat1=(C*)ths->f_hat;
2556  f_hat2=(C*)&ths->f_hat[N/2];
2557  g_hat1=(C*)&ths->g_hat[n-N/2];
2558  g_hat2=(C*)ths->g_hat;
2559 
2560  TIC(2)
2561  nfft_adjoint_1d_B(ths);
2562  TOC(2)
2563 
2564  TIC_FFTW(1)
2565  FFTW(execute)(ths->my_fftw_plan2);
2566  TOC_FFTW(1);
2567 
2568  TIC(0)
2569  if(ths->flags & PRE_PHI_HUT)
2570  {
2571  INT k;
2572  c_phi_inv1=ths->c_phi_inv[0];
2573  c_phi_inv2=&ths->c_phi_inv[0][N/2];
2574 
2575 #ifdef _OPENMP
2576  #pragma omp parallel for default(shared) private(k)
2577 #endif
2578  for (k = 0; k < N/2; k++)
2579  {
2580  f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2581  f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2582  }
2583  }
2584  else
2585  {
2586  INT k;
2587 
2588 #ifdef _OPENMP
2589  #pragma omp parallel for default(shared) private(k)
2590 #endif
2591  for (k = 0; k < N/2; k++)
2592  {
2593  f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0));
2594  f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2595  }
2596  }
2597  TOC(0)
2598 }
2599 
2600 
2601 /* ################################################ SPECIFIC VERSIONS FOR d=2 */
2602 
2603 static void nfft_2d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
2604 {
2605  INT l;
2606  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2607 
2608  fg_exp_b0 = EXP(K(-1.0)/b);
2609  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2610  fg_exp_b1 = K(1.0);
2611  fg_exp_b2 = K(1.0);
2612  fg_exp_l[0] = K(1.0);
2613  for(l=1; l <= 2*m+1; l++)
2614  {
2615  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2616  fg_exp_b1 *= fg_exp_b0_sq;
2617  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2618  }
2619 }
2620 
2621 static void nfft_trafo_2d_compute(C *fj, const C *g, const R *psij_const0,
2622  const R *psij_const1, const R *xj0, const R *xj1, const INT n0,
2623  const INT n1, const INT m)
2624 {
2625  INT u0,o0,l0,u1,o1,l1;
2626  const C *gj;
2627  const R *psij0,*psij1;
2628 
2629  psij0=psij_const0;
2630  psij1=psij_const1;
2631 
2632  uo2(&u0,&o0,*xj0, n0, m);
2633  uo2(&u1,&o1,*xj1, n1, m);
2634 
2635  *fj=0;
2636 
2637  if (u0 < o0)
2638  if(u1 < o1)
2639  for(l0=0; l0<=2*m+1; l0++,psij0++)
2640  {
2641  psij1=psij_const1;
2642  gj=g+(u0+l0)*n1+u1;
2643  for(l1=0; l1<=2*m+1; l1++)
2644  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2645  }
2646  else
2647  for(l0=0; l0<=2*m+1; l0++,psij0++)
2648  {
2649  psij1=psij_const1;
2650  gj=g+(u0+l0)*n1+u1;
2651  for(l1=0; l1<2*m+1-o1; l1++)
2652  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2653  gj=g+(u0+l0)*n1;
2654  for(l1=0; l1<=o1; l1++)
2655  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2656  }
2657  else
2658  if(u1<o1)
2659  {
2660  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2661  {
2662  psij1=psij_const1;
2663  gj=g+(u0+l0)*n1+u1;
2664  for(l1=0; l1<=2*m+1; l1++)
2665  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2666  }
2667  for(l0=0; l0<=o0; l0++,psij0++)
2668  {
2669  psij1=psij_const1;
2670  gj=g+l0*n1+u1;
2671  for(l1=0; l1<=2*m+1; l1++)
2672  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2673  }
2674  }
2675  else
2676  {
2677  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2678  {
2679  psij1=psij_const1;
2680  gj=g+(u0+l0)*n1+u1;
2681  for(l1=0; l1<2*m+1-o1; l1++)
2682  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2683  gj=g+(u0+l0)*n1;
2684  for(l1=0; l1<=o1; l1++)
2685  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2686  }
2687  for(l0=0; l0<=o0; l0++,psij0++)
2688  {
2689  psij1=psij_const1;
2690  gj=g+l0*n1+u1;
2691  for(l1=0; l1<2*m+1-o1; l1++)
2692  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2693  gj=g+l0*n1;
2694  for(l1=0; l1<=o1; l1++)
2695  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2696  }
2697  }
2698 }
2699 
2700 #ifdef _OPENMP
2701 /* adjoint NFFT two-dimensional case with OpenMP atomic operations */
2702 static void nfft_adjoint_2d_compute_omp_atomic(const C f, C *g,
2703  const R *psij_const0, const R *psij_const1, const R *xj0,
2704  const R *xj1, const INT n0, const INT n1, const INT m)
2705 {
2706  INT u0,o0,l0,u1,o1,l1;
2707  const INT lprod = (2*m+2) * (2*m+2);
2708 
2709  INT index_temp0[2*m+2];
2710  INT index_temp1[2*m+2];
2711 
2712  uo2(&u0,&o0,*xj0, n0, m);
2713  uo2(&u1,&o1,*xj1, n1, m);
2714 
2715  for (l0=0; l0<=2*m+1; l0++)
2716  index_temp0[l0] = (u0+l0)%n0;
2717 
2718  for (l1=0; l1<=2*m+1; l1++)
2719  index_temp1[l1] = (u1+l1)%n1;
2720 
2721  for(l0=0; l0<=2*m+1; l0++)
2722  {
2723  for(l1=0; l1<=2*m+1; l1++)
2724  {
2725  INT i = index_temp0[l0] * n1 + index_temp1[l1];
2726  C *lhs = g+i;
2727  R *lhs_real = (R*)lhs;
2728  C val = psij_const0[l0] * psij_const1[l1] * f;
2729 
2730  #pragma omp atomic
2731  lhs_real[0] += creal(val);
2732 
2733  #pragma omp atomic
2734  lhs_real[1] += cimag(val);
2735  }
2736  }
2737 }
2738 #endif
2739 
2740 #ifdef _OPENMP
2741 
2759 static void nfft_adjoint_2d_compute_omp_blockwise(const C f, C *g,
2760  const R *psij_const0, const R *psij_const1, const R *xj0,
2761  const R *xj1, const INT n0, const INT n1, const INT m,
2762  const INT my_u0, const INT my_o0)
2763 {
2764  INT ar_u0,ar_o0,l0,u1,o1,l1;
2765  const INT lprod = (2*m+2) * (2*m+2);
2766  INT index_temp1[2*m+2];
2767 
2768  uo2(&ar_u0,&ar_o0,*xj0, n0, m);
2769  uo2(&u1,&o1,*xj1, n1, m);
2770 
2771  for (l1 = 0; l1 <= 2*m+1; l1++)
2772  index_temp1[l1] = (u1+l1)%n1;
2773 
2774  if(ar_u0 < ar_o0)
2775  {
2776  INT u0 = MAX(my_u0,ar_u0);
2777  INT o0 = MIN(my_o0,ar_o0);
2778  INT offset_psij = u0-ar_u0;
2779 #ifdef OMP_ASSERT
2780  assert(offset_psij >= 0);
2781  assert(o0-u0 <= 2*m+1);
2782  assert(offset_psij+o0-u0 <= 2*m+1);
2783 #endif
2784 
2785  for (l0 = 0; l0 <= o0-u0; l0++)
2786  {
2787  INT i0 = (u0+l0) * n1;
2788  const C val0 = psij_const0[offset_psij+l0];
2789 
2790  for(l1=0; l1<=2*m+1; l1++)
2791  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2792  }
2793  }
2794  else
2795  {
2796  INT u0 = MAX(my_u0,ar_u0);
2797  INT o0 = my_o0;
2798  INT offset_psij = u0-ar_u0;
2799 #ifdef OMP_ASSERT
2800  assert(offset_psij >= 0);
2801  assert(o0-u0 <= 2*m+1);
2802  assert(offset_psij+o0-u0 <= 2*m+1);
2803 #endif
2804 
2805  for (l0 = 0; l0 <= o0-u0; l0++)
2806  {
2807  INT i0 = (u0+l0) * n1;
2808  const C val0 = psij_const0[offset_psij+l0];
2809 
2810  for(l1=0; l1<=2*m+1; l1++)
2811  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2812  }
2813 
2814  u0 = my_u0;
2815  o0 = MIN(my_o0,ar_o0);
2816  offset_psij += my_u0-ar_u0+n0;
2817 
2818 #ifdef OMP_ASSERT
2819  if (u0<=o0)
2820  {
2821  assert(o0-u0 <= 2*m+1);
2822  assert(offset_psij+o0-u0 <= 2*m+1);
2823  }
2824 #endif
2825 
2826  for (l0 = 0; l0 <= o0-u0; l0++)
2827  {
2828  INT i0 = (u0+l0) * n1;
2829  const C val0 = psij_const0[offset_psij+l0];
2830 
2831  for(l1=0; l1<=2*m+1; l1++)
2832  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2833  }
2834  }
2835 }
2836 #endif
2837 
2838 #ifndef _OPENMP
2839 static void nfft_adjoint_2d_compute_serial(const C *fj, C *g,
2840  const R *psij_const0, const R *psij_const1, const R *xj0,
2841  const R *xj1, const INT n0, const INT n1, const INT m)
2842 {
2843  INT u0,o0,l0,u1,o1,l1;
2844  C *gj;
2845  const R *psij0,*psij1;
2846 
2847  psij0=psij_const0;
2848  psij1=psij_const1;
2849 
2850  uo2(&u0,&o0,*xj0, n0, m);
2851  uo2(&u1,&o1,*xj1, n1, m);
2852 
2853  if(u0<o0)
2854  if(u1<o1)
2855  for(l0=0; l0<=2*m+1; l0++,psij0++)
2856  {
2857  psij1=psij_const1;
2858  gj=g+(u0+l0)*n1+u1;
2859  for(l1=0; l1<=2*m+1; l1++)
2860  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2861  }
2862  else
2863  for(l0=0; l0<=2*m+1; l0++,psij0++)
2864  {
2865  psij1=psij_const1;
2866  gj=g+(u0+l0)*n1+u1;
2867  for(l1=0; l1<2*m+1-o1; l1++)
2868  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2869  gj=g+(u0+l0)*n1;
2870  for(l1=0; l1<=o1; l1++)
2871  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2872  }
2873  else
2874  if(u1<o1)
2875  {
2876  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2877  {
2878  psij1=psij_const1;
2879  gj=g+(u0+l0)*n1+u1;
2880  for(l1=0; l1<=2*m+1; l1++)
2881  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2882  }
2883  for(l0=0; l0<=o0; l0++,psij0++)
2884  {
2885  psij1=psij_const1;
2886  gj=g+l0*n1+u1;
2887  for(l1=0; l1<=2*m+1; l1++)
2888  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2889  }
2890  }
2891  else
2892  {
2893  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2894  {
2895  psij1=psij_const1;
2896  gj=g+(u0+l0)*n1+u1;
2897  for(l1=0; l1<2*m+1-o1; l1++)
2898  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2899  gj=g+(u0+l0)*n1;
2900  for(l1=0; l1<=o1; l1++)
2901  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2902  }
2903  for(l0=0; l0<=o0; l0++,psij0++)
2904  {
2905  psij1=psij_const1;
2906  gj=g+l0*n1+u1;
2907  for(l1=0; l1<2*m+1-o1; l1++)
2908  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2909  gj=g+l0*n1;
2910  for(l1=0; l1<=o1; l1++)
2911  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2912  }
2913  }
2914 }
2915 #endif
2916 
2917 static void nfft_trafo_2d_B(X(plan) *ths)
2918 {
2919  const C *g = (C*)ths->g;
2920  const INT n0 = ths->n[0];
2921  const INT n1 = ths->n[1];
2922  const INT M = ths->M_total;
2923  const INT m = ths->m;
2924 
2925  INT k;
2926 
2927  if(ths->flags & PRE_FULL_PSI)
2928  {
2929  const INT lprod = (2*m+2) * (2*m+2);
2930 #ifdef _OPENMP
2931  #pragma omp parallel for default(shared) private(k)
2932 #endif
2933  for (k = 0; k < M; k++)
2934  {
2935  INT l;
2936  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2937  ths->f[j] = K(0.0);
2938  for (l = 0; l < lprod; l++)
2939  ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
2940  }
2941  return;
2942  } /* if(PRE_FULL_PSI) */
2943 
2944  if(ths->flags & PRE_PSI)
2945  {
2946 #ifdef _OPENMP
2947  #pragma omp parallel for default(shared) private(k)
2948 #endif
2949  for (k = 0; k < M; k++)
2950  {
2951  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2952  nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
2953  }
2954 
2955  return;
2956  } /* if(PRE_PSI) */
2957 
2958  if(ths->flags & PRE_FG_PSI)
2959  {
2960  R fg_exp_l[2*(2*m+2)];
2961 
2962  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2963  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
2964 
2965 #ifdef _OPENMP
2966  #pragma omp parallel for default(shared) private(k)
2967 #endif
2968  for (k = 0; k < M; k++)
2969  {
2970  R psij_const[2*(2*m+2)];
2971  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2972  INT l;
2973  R fg_psij0 = ths->psi[2*j*2];
2974  R fg_psij1 = ths->psi[2*j*2+1];
2975  R fg_psij2 = K(1.0);
2976 
2977  psij_const[0] = fg_psij0;
2978  for (l = 1; l <= 2*m+1; l++)
2979  {
2980  fg_psij2 *= fg_psij1;
2981  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
2982  }
2983 
2984  fg_psij0 = ths->psi[2*(j*2+1)];
2985  fg_psij1 = ths->psi[2*(j*2+1)+1];
2986  fg_psij2 = K(1.0);
2987  psij_const[2*m+2] = fg_psij0;
2988  for (l = 1; l <= 2*m+1; l++)
2989  {
2990  fg_psij2 *= fg_psij1;
2991  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
2992  }
2993 
2994  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
2995  }
2996 
2997  return;
2998  } /* if(PRE_FG_PSI) */
2999 
3000  if(ths->flags & FG_PSI)
3001  {
3002  R fg_exp_l[2*(2*m+2)];
3003 
3004  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3005  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3006 
3007  sort(ths);
3008 
3009 #ifdef _OPENMP
3010  #pragma omp parallel for default(shared) private(k)
3011 #endif
3012  for (k = 0; k < M; k++)
3013  {
3014  INT u, o, l;
3015  R fg_psij0, fg_psij1, fg_psij2;
3016  R psij_const[2*(2*m+2)];
3017  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3018 
3019  uo(ths, j, &u, &o, (INT)0);
3020  fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0));
3021  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3022  fg_psij2 = K(1.0);
3023  psij_const[0] = fg_psij0;
3024  for (l = 1; l <= 2*m+1; l++)
3025  {
3026  fg_psij2 *= fg_psij1;
3027  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3028  }
3029 
3030  uo(ths,j,&u,&o, (INT)1);
3031  fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3032  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3033  fg_psij2 = K(1.0);
3034  psij_const[2*m+2] = fg_psij0;
3035  for(l=1; l<=2*m+1; l++)
3036  {
3037  fg_psij2 *= fg_psij1;
3038  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3039  }
3040 
3041  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3042  }
3043 
3044  return;
3045  } /* if(FG_PSI) */
3046 
3047  if(ths->flags & PRE_LIN_PSI)
3048  {
3049  const INT K = ths->K, ip_s = K / (m + 2);
3050 
3051  sort(ths);
3052 
3053 #ifdef _OPENMP
3054  #pragma omp parallel for default(shared) private(k)
3055 #endif
3056  for (k = 0; k < M; k++)
3057  {
3058  INT u, o, l;
3059  R ip_y, ip_w;
3060  INT ip_u;
3061  R psij_const[2*(2*m+2)];
3062  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3063 
3064  uo(ths,j,&u,&o,(INT)0);
3065  ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s);
3066  ip_u = (INT)LRINT(FLOOR(ip_y));
3067  ip_w = ip_y - (R)(ip_u);
3068  for (l = 0; l < 2*m+2; l++)
3069  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3070 
3071  uo(ths,j,&u,&o,(INT)1);
3072  ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s);
3073  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3074  ip_w = ip_y - (R)(ip_u);
3075  for (l = 0; l < 2*m+2; l++)
3076  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3077 
3078  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3079  }
3080  return;
3081  } /* if(PRE_LIN_PSI) */
3082 
3083  /* no precomputed psi at all */
3084 
3085  sort(ths);
3086 
3087 #ifdef _OPENMP
3088  #pragma omp parallel for default(shared) private(k)
3089 #endif
3090  for (k = 0; k < M; k++)
3091  {
3092  R psij_const[2*(2*m+2)];
3093  INT u, o, l;
3094  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3095 
3096  uo(ths,j,&u,&o,(INT)0);
3097  for (l = 0; l <= 2*m+1; l++)
3098  psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3099 
3100  uo(ths,j,&u,&o,(INT)1);
3101  for (l = 0; l <= 2*m+1; l++)
3102  psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1));
3103 
3104  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3105  }
3106 }
3107 
3108 #ifdef OMP_ASSERT
3109 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3110 { \
3111  assert(ar_x[2*k] >= min_u_a || k == M-1); \
3112  if (k > 0) \
3113  assert(ar_x[2*k-2] < min_u_a); \
3114 }
3115 #else
3116 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
3117 #endif
3118 
3119 #ifdef OMP_ASSERT
3120 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3121 { \
3122  assert(ar_x[2*k] >= min_u_b || k == M-1); \
3123  if (k > 0) \
3124  assert(ar_x[2*k-2] < min_u_b); \
3125 }
3126 #else
3127 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
3128 #endif
3129 
3130 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
3131  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3132  ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
3133  ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
3134 
3135 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
3136 { \
3137  R psij_const[2*(2*m+2)]; \
3138  INT u, o, l; \
3139  R fg_psij0 = ths->psi[2*j*2]; \
3140  R fg_psij1 = ths->psi[2*j*2+1]; \
3141  R fg_psij2 = K(1.0); \
3142  \
3143  psij_const[0] = fg_psij0; \
3144  for(l=1; l<=2*m+1; l++) \
3145  { \
3146  fg_psij2 *= fg_psij1; \
3147  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3148  } \
3149  \
3150  fg_psij0 = ths->psi[2*(j*2+1)]; \
3151  fg_psij1 = ths->psi[2*(j*2+1)+1]; \
3152  fg_psij2 = K(1.0); \
3153  psij_const[2*m+2] = fg_psij0; \
3154  for(l=1; l<=2*m+1; l++) \
3155  { \
3156  fg_psij2 *= fg_psij1; \
3157  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3158  } \
3159  \
3160  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3161  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3162  n0, n1, m, my_u0, my_o0); \
3163 }
3164 
3165 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
3166 { \
3167  R psij_const[2*(2*m+2)]; \
3168  R fg_psij0, fg_psij1, fg_psij2; \
3169  INT u, o, l; \
3170  \
3171  uo(ths,j,&u,&o,(INT)0); \
3172  fg_psij0 = (PHI(ths->n[0],ths->x[2*j]-((R)u)/n0,0)); \
3173  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \
3174  fg_psij2 = K(1.0); \
3175  psij_const[0] = fg_psij0; \
3176  for(l=1; l<=2*m+1; l++) \
3177  { \
3178  fg_psij2 *= fg_psij1; \
3179  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3180  } \
3181  \
3182  uo(ths,j,&u,&o,(INT)1); \
3183  fg_psij0 = (PHI(ths->n[1],ths->x[2*j+1]-((R)u)/n1,1)); \
3184  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \
3185  fg_psij2 = K(1.0); \
3186  psij_const[2*m+2] = fg_psij0; \
3187  for(l=1; l<=2*m+1; l++) \
3188  { \
3189  fg_psij2 *= fg_psij1; \
3190  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3191  } \
3192  \
3193  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3194  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3195  n0, n1, m, my_u0, my_o0); \
3196 }
3197 
3198 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
3199 { \
3200  R psij_const[2*(2*m+2)]; \
3201  INT u, o, l; \
3202  INT ip_u; \
3203  R ip_y, ip_w; \
3204  \
3205  uo(ths,j,&u,&o,(INT)0); \
3206  ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \
3207  ip_u = LRINT(FLOOR(ip_y)); \
3208  ip_w = ip_y-ip_u; \
3209  for(l=0; l < 2*m+2; l++) \
3210  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3211  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
3212  \
3213  uo(ths,j,&u,&o,(INT)1); \
3214  ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \
3215  ip_u = LRINT(FLOOR(ip_y)); \
3216  ip_w = ip_y-ip_u; \
3217  for(l=0; l < 2*m+2; l++) \
3218  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3219  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
3220  \
3221  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3222  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3223  n0, n1, m, my_u0, my_o0); \
3224 }
3225 
3226 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
3227 { \
3228  R psij_const[2*(2*m+2)]; \
3229  INT u, o, l; \
3230  \
3231  uo(ths,j,&u,&o,(INT)0); \
3232  for(l=0;l<=2*m+1;l++) \
3233  psij_const[l]=(PHI(ths->n[0],ths->x[2*j]-((R)((u+l)))/n0,0)); \
3234  \
3235  uo(ths,j,&u,&o,(INT)1); \
3236  for(l=0;l<=2*m+1;l++) \
3237  psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[2*j+1]-((R)((u+l)))/n1,1)); \
3238  \
3239  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3240  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3241  n0, n1, m, my_u0, my_o0); \
3242 }
3243 
3244 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
3245 { \
3246  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
3247  { \
3248  _Pragma("omp parallel private(k)") \
3249  { \
3250  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
3251  INT *ar_x = ths->index_x; \
3252  \
3253  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
3254  &min_u_b, &max_u_b, 2, ths->n, m); \
3255  \
3256  if (min_u_a != -1) \
3257  { \
3258  k = index_x_binary_search(ar_x, M, min_u_a); \
3259  \
3260  MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3261  \
3262  while (k < M) \
3263  { \
3264  INT u_prod = ar_x[2*k]; \
3265  INT j = ar_x[2*k+1]; \
3266  \
3267  if (u_prod < min_u_a || u_prod > max_u_a) \
3268  break; \
3269  \
3270  MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3271  \
3272  k++; \
3273  } \
3274  } \
3275  \
3276  if (min_u_b != -1) \
3277  { \
3278  INT k = index_x_binary_search(ar_x, M, min_u_b); \
3279  \
3280  MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3281  \
3282  while (k < M) \
3283  { \
3284  INT u_prod = ar_x[2*k]; \
3285  INT j = ar_x[2*k+1]; \
3286  \
3287  if (u_prod < min_u_b || u_prod > max_u_b) \
3288  break; \
3289  \
3290  MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3291  \
3292  k++; \
3293  } \
3294  } \
3295  } /* omp parallel */ \
3296  return; \
3297  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
3298 }
3299 
3300 
3301 static void nfft_adjoint_2d_B(X(plan) *ths)
3302 {
3303  const INT n0 = ths->n[0];
3304  const INT n1 = ths->n[1];
3305  const INT M = ths->M_total;
3306  const INT m = ths->m;
3307  C* g = (C*) ths->g;
3308  INT k;
3309 
3310  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
3311 
3312  if(ths->flags & PRE_FULL_PSI)
3313  {
3314  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
3315  (INT)2, ths->n, m, ths->flags, ths->index_x);
3316  return;
3317  } /* if(PRE_FULL_PSI) */
3318 
3319  if(ths->flags & PRE_PSI)
3320  {
3321 #ifdef _OPENMP
3322  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
3323 #endif
3324 
3325 #ifdef _OPENMP
3326  #pragma omp parallel for default(shared) private(k)
3327 #endif
3328  for (k = 0; k < M; k++)
3329  {
3330  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3331 #ifdef _OPENMP
3332  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3333 #else
3334  nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3335 #endif
3336  }
3337  return;
3338  } /* if(PRE_PSI) */
3339 
3340  if(ths->flags & PRE_FG_PSI)
3341  {
3342  R fg_exp_l[2*(2*m+2)];
3343 
3344  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3345  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3346 
3347 #ifdef _OPENMP
3348  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
3349 #endif
3350 
3351 
3352 #ifdef _OPENMP
3353  #pragma omp parallel for default(shared) private(k)
3354 #endif
3355  for (k = 0; k < M; k++)
3356  {
3357  R psij_const[2*(2*m+2)];
3358  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3359  INT l;
3360  R fg_psij0 = ths->psi[2*j*2];
3361  R fg_psij1 = ths->psi[2*j*2+1];
3362  R fg_psij2 = K(1.0);
3363 
3364  psij_const[0] = fg_psij0;
3365  for(l=1; l<=2*m+1; l++)
3366  {
3367  fg_psij2 *= fg_psij1;
3368  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3369  }
3370 
3371  fg_psij0 = ths->psi[2*(j*2+1)];
3372  fg_psij1 = ths->psi[2*(j*2+1)+1];
3373  fg_psij2 = K(1.0);
3374  psij_const[2*m+2] = fg_psij0;
3375  for(l=1; l<=2*m+1; l++)
3376  {
3377  fg_psij2 *= fg_psij1;
3378  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3379  }
3380 
3381 #ifdef _OPENMP
3382  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3383 #else
3384  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3385 #endif
3386  }
3387 
3388  return;
3389  } /* if(PRE_FG_PSI) */
3390 
3391  if(ths->flags & FG_PSI)
3392  {
3393  R fg_exp_l[2*(2*m+2)];
3394 
3395  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3396  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3397 
3398  sort(ths);
3399 
3400 #ifdef _OPENMP
3401  MACRO_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
3402 #endif
3403 
3404 #ifdef _OPENMP
3405  #pragma omp parallel for default(shared) private(k)
3406 #endif
3407  for (k = 0; k < M; k++)
3408  {
3409  INT u, o, l;
3410  R fg_psij0, fg_psij1, fg_psij2;
3411  R psij_const[2*(2*m+2)];
3412  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3413 
3414  uo(ths,j,&u,&o,(INT)0);
3415  fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0));
3416  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3417  fg_psij2 = K(1.0);
3418  psij_const[0] = fg_psij0;
3419  for(l=1; l<=2*m+1; l++)
3420  {
3421  fg_psij2 *= fg_psij1;
3422  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3423  }
3424 
3425  uo(ths,j,&u,&o,(INT)1);
3426  fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3427  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3428  fg_psij2 = K(1.0);
3429  psij_const[2*m+2] = fg_psij0;
3430  for(l=1; l<=2*m+1; l++)
3431  {
3432  fg_psij2 *= fg_psij1;
3433  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3434  }
3435 
3436 #ifdef _OPENMP
3437  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3438 #else
3439  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3440 #endif
3441  }
3442 
3443  return;
3444  } /* if(FG_PSI) */
3445 
3446  if(ths->flags & PRE_LIN_PSI)
3447  {
3448  const INT K = ths->K;
3449  const INT ip_s = K / (m + 2);
3450 
3451  sort(ths);
3452 
3453 #ifdef _OPENMP
3454  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
3455 #endif
3456 
3457 #ifdef _OPENMP
3458  #pragma omp parallel for default(shared) private(k)
3459 #endif
3460  for (k = 0; k < M; k++)
3461  {
3462  INT u,o,l;
3463  INT ip_u;
3464  R ip_y, ip_w;
3465  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3466  R psij_const[2*(2*m+2)];
3467 
3468  uo(ths,j,&u,&o,(INT)0);
3469  ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s);
3470  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3471  ip_w = ip_y - (R)(ip_u);
3472  for(l=0; l < 2*m+2; l++)
3473  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3474  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3475 
3476  uo(ths,j,&u,&o,(INT)1);
3477  ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s);
3478  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3479  ip_w = ip_y - (R)(ip_u);
3480  for(l=0; l < 2*m+2; l++)
3481  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3482  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3483 
3484 #ifdef _OPENMP
3485  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3486 #else
3487  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3488 #endif
3489  }
3490  return;
3491  } /* if(PRE_LIN_PSI) */
3492 
3493  /* no precomputed psi at all */
3494  sort(ths);
3495 
3496 #ifdef _OPENMP
3497  MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3498 #endif
3499 
3500 #ifdef _OPENMP
3501  #pragma omp parallel for default(shared) private(k)
3502 #endif
3503  for (k = 0; k < M; k++)
3504  {
3505  INT u,o,l;
3506  R psij_const[2*(2*m+2)];
3507  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3508 
3509  uo(ths,j,&u,&o,(INT)0);
3510  for(l=0;l<=2*m+1;l++)
3511  psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3512 
3513  uo(ths,j,&u,&o,(INT)1);
3514  for(l=0;l<=2*m+1;l++)
3515  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1));
3516 
3517 #ifdef _OPENMP
3518  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3519 #else
3520  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3521 #endif
3522  }
3523 }
3524 
3525 
3526 void X(trafo_2d)(X(plan) *ths)
3527 {
3528  INT k0,k1,n0,n1,N0,N1;
3529  C *g_hat,*f_hat;
3530  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3531  R ck01, ck02, ck11, ck12;
3532  C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3533 
3534  ths->g_hat=ths->g1;
3535  ths->g=ths->g2;
3536 
3537  N0=ths->N[0];
3538  N1=ths->N[1];
3539  n0=ths->n[0];
3540  n1=ths->n[1];
3541 
3542  f_hat=(C*)ths->f_hat;
3543  g_hat=(C*)ths->g_hat;
3544 
3545  TIC(0)
3546 #ifdef _OPENMP
3547  #pragma omp parallel for default(shared) private(k0)
3548  for (k0 = 0; k0 < ths->n_total; k0++)
3549  ths->g_hat[k0] = 0.0;
3550 #else
3551  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
3552 #endif
3553  if(ths->flags & PRE_PHI_HUT)
3554  {
3555  c_phi_inv01=ths->c_phi_inv[0];
3556  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3557 
3558 #ifdef _OPENMP
3559  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3560 #endif
3561  for(k0=0;k0<N0/2;k0++)
3562  {
3563  ck01=c_phi_inv01[k0];
3564  ck02=c_phi_inv02[k0];
3565 
3566  c_phi_inv11=ths->c_phi_inv[1];
3567  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3568 
3569  g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3570  f_hat11=f_hat + k0*N1;
3571  g_hat21=g_hat + k0*n1+n1-(N1/2);
3572  f_hat21=f_hat + ((N0/2)+k0)*N1;
3573  g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3574  f_hat12=f_hat + k0*N1+(N1/2);
3575  g_hat22=g_hat + k0*n1;
3576  f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3577 
3578  for(k1=0;k1<N1/2;k1++)
3579  {
3580  ck11=c_phi_inv11[k1];
3581  ck12=c_phi_inv12[k1];
3582 
3583  g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3584  g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3585  g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3586  g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3587  }
3588  }
3589  }
3590  else
3591 #ifdef _OPENMP
3592  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3593 #endif
3594  for(k0=0;k0<N0/2;k0++)
3595  {
3596  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3597  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3598  for(k1=0;k1<N1/2;k1++)
3599  {
3600  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3601  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3602  g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3603  g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3604  g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3605  g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3606  }
3607  }
3608 
3609  TOC(0)
3610 
3611  TIC_FFTW(1)
3612  FFTW(execute)(ths->my_fftw_plan1);
3613  TOC_FFTW(1);
3614 
3615  TIC(2);
3616  nfft_trafo_2d_B(ths);
3617  TOC(2);
3618 }
3619 
3620 void X(adjoint_2d)(X(plan) *ths)
3621 {
3622  INT k0,k1,n0,n1,N0,N1;
3623  C *g_hat,*f_hat;
3624  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3625  R ck01, ck02, ck11, ck12;
3626  C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3627 
3628  ths->g_hat=ths->g1;
3629  ths->g=ths->g2;
3630 
3631  N0=ths->N[0];
3632  N1=ths->N[1];
3633  n0=ths->n[0];
3634  n1=ths->n[1];
3635 
3636  f_hat=(C*)ths->f_hat;
3637  g_hat=(C*)ths->g_hat;
3638 
3639  TIC(2);
3640  nfft_adjoint_2d_B(ths);
3641  TOC(2);
3642 
3643  TIC_FFTW(1)
3644  FFTW(execute)(ths->my_fftw_plan2);
3645  TOC_FFTW(1);
3646 
3647  TIC(0)
3648  if(ths->flags & PRE_PHI_HUT)
3649  {
3650  c_phi_inv01=ths->c_phi_inv[0];
3651  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3652 
3653 #ifdef _OPENMP
3654  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3655 #endif
3656  for(k0=0;k0<N0/2;k0++)
3657  {
3658  ck01=c_phi_inv01[k0];
3659  ck02=c_phi_inv02[k0];
3660 
3661  c_phi_inv11=ths->c_phi_inv[1];
3662  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3663 
3664  g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3665  f_hat11=f_hat + k0*N1;
3666  g_hat21=g_hat + k0*n1+n1-(N1/2);
3667  f_hat21=f_hat + ((N0/2)+k0)*N1;
3668  g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3669  f_hat12=f_hat + k0*N1+(N1/2);
3670  g_hat22=g_hat + k0*n1;
3671  f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3672 
3673  for(k1=0;k1<N1/2;k1++)
3674  {
3675  ck11=c_phi_inv11[k1];
3676  ck12=c_phi_inv12[k1];
3677 
3678  f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3679  f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3680  f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3681  f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3682  }
3683  }
3684  }
3685  else
3686 #ifdef _OPENMP
3687  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3688 #endif
3689  for(k0=0;k0<N0/2;k0++)
3690  {
3691  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3692  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3693  for(k1=0;k1<N1/2;k1++)
3694  {
3695  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3696  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3697  f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3698  f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3699  f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3700  f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
3701  }
3702  }
3703  TOC(0)
3704 }
3705 
3706 /* ################################################ SPECIFIC VERSIONS FOR d=3 */
3707 
3708 static void nfft_3d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
3709 {
3710  INT l;
3711  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
3712 
3713  fg_exp_b0 = EXP(-K(1.0) / b);
3714  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
3715  fg_exp_b1 = K(1.0);
3716  fg_exp_b2 = K(1.0);
3717  fg_exp_l[0] = K(1.0);
3718  for(l=1; l <= 2*m+1; l++)
3719  {
3720  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
3721  fg_exp_b1 *= fg_exp_b0_sq;
3722  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
3723  }
3724 }
3725 
3726 static void nfft_trafo_3d_compute(C *fj, const C *g, const R *psij_const0,
3727  const R *psij_const1, const R *psij_const2, const R *xj0, const R *xj1,
3728  const R *xj2, const INT n0, const INT n1, const INT n2, const INT m)
3729 {
3730  INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
3731  const C *gj;
3732  const R *psij0, *psij1, *psij2;
3733 
3734  psij0 = psij_const0;
3735  psij1 = psij_const1;
3736  psij2 = psij_const2;
3737 
3738  uo2(&u0, &o0, *xj0, n0, m);
3739  uo2(&u1, &o1, *xj1, n1, m);
3740  uo2(&u2, &o2, *xj2, n2, m);
3741 
3742  *fj = 0;
3743 
3744  if (u0 < o0)
3745  if (u1 < o1)
3746  if (u2 < o2)
3747  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3748  {
3749  psij1 = psij_const1;
3750  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3751  {
3752  psij2 = psij_const2;
3753  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3754  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3755  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3756  }
3757  }
3758  else
3759  /* asserts (u2>o2)*/
3760  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3761  {
3762  psij1 = psij_const1;
3763  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3764  {
3765  psij2 = psij_const2;
3766  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3767  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3768  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3769  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3770  for (l2 = 0; l2 <= o2; l2++)
3771  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3772  }
3773  }
3774  else /* asserts (u1>o1)*/
3775  if (u2 < o2)
3776  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3777  {
3778  psij1 = psij_const1;
3779  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3780  {
3781  psij2 = psij_const2;
3782  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3783  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3784  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3785  }
3786  for (l1 = 0; l1 <= o1; l1++, psij1++)
3787  {
3788  psij2 = psij_const2;
3789  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3790  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3791  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3792  }
3793  }
3794  else/* asserts (u2>o2) */
3795  {
3796  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3797  {
3798  psij1 = psij_const1;
3799  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3800  {
3801  psij2 = psij_const2;
3802  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3803  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3804  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3805  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3806  for (l2 = 0; l2 <= o2; l2++)
3807  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3808  }
3809  for (l1 = 0; l1 <= o1; l1++, psij1++)
3810  {
3811  psij2 = psij_const2;
3812  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3813  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3814  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3815  gj = g + ((u0 + l0) * n1 + l1) * n2;
3816  for (l2 = 0; l2 <= o2; l2++)
3817  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3818  }
3819  }
3820  }
3821  else /* asserts (u0>o0) */
3822  if (u1 < o1)
3823  if (u2 < o2)
3824  {
3825  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3826  {
3827  psij1 = psij_const1;
3828  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3829  {
3830  psij2 = psij_const2;
3831  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3832  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3833  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3834  }
3835  }
3836 
3837  for (l0 = 0; l0 <= o0; l0++, psij0++)
3838  {
3839  psij1 = psij_const1;
3840  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3841  {
3842  psij2 = psij_const2;
3843  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3844  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3845  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3846  }
3847  }
3848  } else/* asserts (u2>o2) */
3849  {
3850  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3851  {
3852  psij1 = psij_const1;
3853  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3854  {
3855  psij2 = psij_const2;
3856  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3857  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3858  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3859  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3860  for (l2 = 0; l2 <= o2; l2++)
3861  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3862  }
3863  }
3864 
3865  for (l0 = 0; l0 <= o0; l0++, psij0++)
3866  {
3867  psij1 = psij_const1;
3868  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3869  {
3870  psij2 = psij_const2;
3871  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3872  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3873  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3874  gj = g + (l0 * n1 + (u1 + l1)) * n2;
3875  for (l2 = 0; l2 <= o2; l2++)
3876  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3877  }
3878  }
3879  }
3880  else /* asserts (u1>o1) */
3881  if (u2 < o2)
3882  {
3883  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3884  {
3885  psij1 = psij_const1;
3886  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3887  {
3888  psij2 = psij_const2;
3889  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3890  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3891  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3892  }
3893  for (l1 = 0; l1 <= o1; l1++, psij1++)
3894  {
3895  psij2 = psij_const2;
3896  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3897  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3898  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3899  }
3900  }
3901  for (l0 = 0; l0 <= o0; l0++, psij0++)
3902  {
3903  psij1 = psij_const1;
3904  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3905  {
3906  psij2 = psij_const2;
3907  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3908  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3909  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3910  }
3911  for (l1 = 0; l1 <= o1; l1++, psij1++)
3912  {
3913  psij2 = psij_const2;
3914  gj = g + (l0 * n1 + l1) * n2 + u2;
3915  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3916  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3917  }
3918  }
3919  } else/* asserts (u2>o2) */
3920  {
3921  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3922  {
3923  psij1 = psij_const1;
3924  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3925  {
3926  psij2 = psij_const2;
3927  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3928  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3929  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3930  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3931  for (l2 = 0; l2 <= o2; l2++)
3932  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3933  }
3934  for (l1 = 0; l1 <= o1; l1++, psij1++)
3935  {
3936  psij2 = psij_const2;
3937  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3938  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3939  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3940  gj = g + ((u0 + l0) * n1 + l1) * n2;
3941  for (l2 = 0; l2 <= o2; l2++)
3942  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3943  }
3944  }
3945 
3946  for (l0 = 0; l0 <= o0; l0++, psij0++)
3947  {
3948  psij1 = psij_const1;
3949  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3950  {
3951  psij2 = psij_const2;
3952  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3953  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3954  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3955  gj = g + (l0 * n1 + (u1 + l1)) * n2;
3956  for (l2 = 0; l2 <= o2; l2++)
3957  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3958  }
3959  for (l1 = 0; l1 <= o1; l1++, psij1++)
3960  {
3961  psij2 = psij_const2;
3962  gj = g + (l0 * n1 + l1) * n2 + u2;
3963  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3964  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3965  gj = g + (l0 * n1 + l1) * n2;
3966  for (l2 = 0; l2 <= o2; l2++)
3967  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3968  }
3969  }
3970  }
3971 }
3972 
3973 #ifdef _OPENMP
3974 
3995 static void nfft_adjoint_3d_compute_omp_blockwise(const C f, C *g,
3996  const R *psij_const0, const R *psij_const1, const R *psij_const2,
3997  const R *xj0, const R *xj1, const R *xj2,
3998  const INT n0, const INT n1, const INT n2, const INT m,
3999  const INT my_u0, const INT my_o0)
4000 {
4001  INT ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
4002  const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4003 
4004  INT index_temp1[2*m+2];
4005  INT index_temp2[2*m+2];
4006 
4007  uo2(&ar_u0,&ar_o0,*xj0, n0, m);
4008  uo2(&u1,&o1,*xj1, n1, m);
4009  uo2(&u2,&o2,*xj2, n2, m);
4010 
4011  for (l1=0; l1<=2*m+1; l1++)
4012  index_temp1[l1] = (u1+l1)%n1;
4013 
4014  for (l2=0; l2<=2*m+1; l2++)
4015  index_temp2[l2] = (u2+l2)%n2;
4016 
4017  if(ar_u0<ar_o0)
4018  {
4019  INT u0 = MAX(my_u0,ar_u0);
4020  INT o0 = MIN(my_o0,ar_o0);
4021  INT offset_psij = u0-ar_u0;
4022 #ifdef OMP_ASSERT
4023  assert(offset_psij >= 0);
4024  assert(o0-u0 <= 2*m+1);
4025  assert(offset_psij+o0-u0 <= 2*m+1);
4026 #endif
4027 
4028  for (l0 = 0; l0 <= o0-u0; l0++)
4029  {
4030  const INT i0 = (u0+l0) * n1;
4031  const C val0 = psij_const0[offset_psij+l0];
4032 
4033  for(l1=0; l1<=2*m+1; l1++)
4034  {
4035  const INT i1 = (i0 + index_temp1[l1]) * n2;
4036  const C val1 = psij_const1[l1];
4037 
4038  for(l2=0; l2<=2*m+1; l2++)
4039  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4040  }
4041  }
4042  }
4043  else
4044  {
4045  INT u0 = MAX(my_u0,ar_u0);
4046  INT o0 = my_o0;
4047  INT offset_psij = u0-ar_u0;
4048 #ifdef OMP_ASSERT
4049  assert(offset_psij >= 0);
4050  assert(o0-u0 <= 2*m+1);
4051  assert(offset_psij+o0-u0 <= 2*m+1);
4052 #endif
4053 
4054  for (l0 = 0; l0 <= o0-u0; l0++)
4055  {
4056  INT i0 = (u0+l0) * n1;
4057  const C val0 = psij_const0[offset_psij+l0];
4058 
4059  for(l1=0; l1<=2*m+1; l1++)
4060  {
4061  const INT i1 = (i0 + index_temp1[l1]) * n2;
4062  const C val1 = psij_const1[l1];
4063 
4064  for(l2=0; l2<=2*m+1; l2++)
4065  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4066  }
4067  }
4068 
4069  u0 = my_u0;
4070  o0 = MIN(my_o0,ar_o0);
4071  offset_psij += my_u0-ar_u0+n0;
4072 
4073 #ifdef OMP_ASSERT
4074  if (u0<=o0)
4075  {
4076  assert(o0-u0 <= 2*m+1);
4077  assert(offset_psij+o0-u0 <= 2*m+1);
4078  }
4079 #endif
4080  for (l0 = 0; l0 <= o0-u0; l0++)
4081  {
4082  INT i0 = (u0+l0) * n1;
4083  const C val0 = psij_const0[offset_psij+l0];
4084 
4085  for(l1=0; l1<=2*m+1; l1++)
4086  {
4087  const INT i1 = (i0 + index_temp1[l1]) * n2;
4088  const C val1 = psij_const1[l1];
4089 
4090  for(l2=0; l2<=2*m+1; l2++)
4091  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4092  }
4093  }
4094  }
4095 }
4096 #endif
4097 
4098 #ifdef _OPENMP
4099 /* adjoint NFFT three-dimensional case with OpenMP atomic operations */
4100 static void nfft_adjoint_3d_compute_omp_atomic(const C f, C *g,
4101  const R *psij_const0, const R *psij_const1, const R *psij_const2,
4102  const R *xj0, const R *xj1, const R *xj2,
4103  const INT n0, const INT n1, const INT n2, const INT m)
4104 {
4105  INT u0,o0,l0,u1,o1,l1,u2,o2,l2;
4106  const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4107 
4108  INT index_temp0[2*m+2];
4109  INT index_temp1[2*m+2];
4110  INT index_temp2[2*m+2];
4111 
4112  uo2(&u0,&o0,*xj0, n0, m);
4113  uo2(&u1,&o1,*xj1, n1, m);
4114  uo2(&u2,&o2,*xj2, n2, m);
4115 
4116  for (l0=0; l0<=2*m+1; l0++)
4117  index_temp0[l0] = (u0+l0)%n0;
4118 
4119  for (l1=0; l1<=2*m+1; l1++)
4120  index_temp1[l1] = (u1+l1)%n1;
4121 
4122  for (l2=0; l2<=2*m+1; l2++)
4123  index_temp2[l2] = (u2+l2)%n2;
4124 
4125  for(l0=0; l0<=2*m+1; l0++)
4126  {
4127  for(l1=0; l1<=2*m+1; l1++)
4128  {
4129  for(l2=0; l2<=2*m+1; l2++)
4130  {
4131  INT i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
4132  C *lhs = g+i;
4133  R *lhs_real = (R*)lhs;
4134  C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
4135 
4136 #pragma omp atomic
4137  lhs_real[0] += creal(val);
4138 
4139 #pragma omp atomic
4140  lhs_real[1] += cimag(val);
4141  }
4142  }
4143  }
4144 }
4145 #endif
4146 
4147 #ifndef _OPENMP
4148 static void nfft_adjoint_3d_compute_serial(const C *fj, C *g,
4149  const R *psij_const0, const R *psij_const1, const R *psij_const2, const R *xj0,
4150  const R *xj1, const R *xj2, const INT n0, const INT n1, const INT n2,
4151  const INT m)
4152 {
4153  INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4154  C *gj;
4155  const R *psij0, *psij1, *psij2;
4156 
4157  psij0 = psij_const0;
4158  psij1 = psij_const1;
4159  psij2 = psij_const2;
4160 
4161  uo2(&u0, &o0, *xj0, n0, m);
4162  uo2(&u1, &o1, *xj1, n1, m);
4163  uo2(&u2, &o2, *xj2, n2, m);
4164 
4165  if (u0 < o0)
4166  if (u1 < o1)
4167  if (u2 < o2)
4168  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4169  {
4170  psij1 = psij_const1;
4171  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4172  {
4173  psij2 = psij_const2;
4174  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4175  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4176  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4177  }
4178  }
4179  else
4180  /* asserts (u2>o2)*/
4181  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4182  {
4183  psij1 = psij_const1;
4184  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4185  {
4186  psij2 = psij_const2;
4187  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4188  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4189  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4190  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4191  for (l2 = 0; l2 <= o2; l2++)
4192  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4193  }
4194  }
4195  else /* asserts (u1>o1)*/
4196  if (u2 < o2)
4197  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4198  {
4199  psij1 = psij_const1;
4200  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4201  {
4202  psij2 = psij_const2;
4203  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4204  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4205  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4206  }
4207  for (l1 = 0; l1 <= o1; l1++, psij1++)
4208  {
4209  psij2 = psij_const2;
4210  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4211  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4212  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4213  }
4214  }
4215  else/* asserts (u2>o2) */
4216  {
4217  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4218  {
4219  psij1 = psij_const1;
4220  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4221  {
4222  psij2 = psij_const2;
4223  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4224  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4225  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4226  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4227  for (l2 = 0; l2 <= o2; l2++)
4228  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4229  }
4230  for (l1 = 0; l1 <= o1; l1++, psij1++)
4231  {
4232  psij2 = psij_const2;
4233  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4234  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4235  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4236  gj = g + ((u0 + l0) * n1 + l1) * n2;
4237  for (l2 = 0; l2 <= o2; l2++)
4238  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4239  }
4240  }
4241  }
4242  else /* asserts (u0>o0) */
4243  if (u1 < o1)
4244  if (u2 < o2)
4245  {
4246  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4247  {
4248  psij1 = psij_const1;
4249  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4250  {
4251  psij2 = psij_const2;
4252  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4253  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4254  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4255  }
4256  }
4257 
4258  for (l0 = 0; l0 <= o0; l0++, psij0++)
4259  {
4260  psij1 = psij_const1;
4261  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4262  {
4263  psij2 = psij_const2;
4264  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4265  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4266  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4267  }
4268  }
4269  } else/* asserts (u2>o2) */
4270  {
4271  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4272  {
4273  psij1 = psij_const1;
4274  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4275  {
4276  psij2 = psij_const2;
4277  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4278  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4279  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4280  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4281  for (l2 = 0; l2 <= o2; l2++)
4282  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4283  }
4284  }
4285 
4286  for (l0 = 0; l0 <= o0; l0++, psij0++)
4287  {
4288  psij1 = psij_const1;
4289  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4290  {
4291  psij2 = psij_const2;
4292  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4293  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4294  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4295  gj = g + (l0 * n1 + (u1 + l1)) * n2;
4296  for (l2 = 0; l2 <= o2; l2++)
4297  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4298  }
4299  }
4300  }
4301  else /* asserts (u1>o1) */
4302  if (u2 < o2)
4303  {
4304  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4305  {
4306  psij1 = psij_const1;
4307  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4308  {
4309  psij2 = psij_const2;
4310  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4311  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4312  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4313  }
4314  for (l1 = 0; l1 <= o1; l1++, psij1++)
4315  {
4316  psij2 = psij_const2;
4317  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4318  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4319  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4320  }
4321  }
4322  for (l0 = 0; l0 <= o0; l0++, psij0++)
4323  {
4324  psij1 = psij_const1;
4325  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4326  {
4327  psij2 = psij_const2;
4328  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4329  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4330  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4331  }
4332  for (l1 = 0; l1 <= o1; l1++, psij1++)
4333  {
4334  psij2 = psij_const2;
4335  gj = g + (l0 * n1 + l1) * n2 + u2;
4336  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4337  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4338  }
4339  }
4340  } else/* asserts (u2>o2) */
4341  {
4342  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4343  {
4344  psij1 = psij_const1;
4345  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4346  {
4347  psij2 = psij_const2;
4348  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4349  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4350  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4351  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4352  for (l2 = 0; l2 <= o2; l2++)
4353  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4354  }
4355  for (l1 = 0; l1 <= o1; l1++, psij1++)
4356  {
4357  psij2 = psij_const2;
4358  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4359  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4360  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4361  gj = g + ((u0 + l0) * n1 + l1) * n2;
4362  for (l2 = 0; l2 <= o2; l2++)
4363  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4364  }
4365  }
4366 
4367  for (l0 = 0; l0 <= o0; l0++, psij0++)
4368  {
4369  psij1 = psij_const1;
4370  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4371  {
4372  psij2 = psij_const2;
4373  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4374  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4375  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4376  gj = g + (l0 * n1 + (u1 + l1)) * n2;
4377  for (l2 = 0; l2 <= o2; l2++)
4378  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4379  }
4380  for (l1 = 0; l1 <= o1; l1++, psij1++)
4381  {
4382  psij2 = psij_const2;
4383  gj = g + (l0 * n1 + l1) * n2 + u2;
4384  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4385  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4386  gj = g + (l0 * n1 + l1) * n2;
4387  for (l2 = 0; l2 <= o2; l2++)
4388  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4389  }
4390  }
4391  }
4392 }
4393 #endif
4394 
4395 static void nfft_trafo_3d_B(X(plan) *ths)
4396 {
4397  const INT n0 = ths->n[0];
4398  const INT n1 = ths->n[1];
4399  const INT n2 = ths->n[2];
4400  const INT M = ths->M_total;
4401  const INT m = ths->m;
4402 
4403  const C* g = (C*) ths->g;
4404 
4405  INT k;
4406 
4407  if(ths->flags & PRE_FULL_PSI)
4408  {
4409  const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4410 #ifdef _OPENMP
4411  #pragma omp parallel for default(shared) private(k)
4412 #endif
4413  for (k = 0; k < M; k++)
4414  {
4415  INT l;
4416  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4417  ths->f[j] = K(0.0);
4418  for (l = 0; l < lprod; l++)
4419  ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
4420  }
4421  return;
4422  } /* if(PRE_FULL_PSI) */
4423 
4424  if(ths->flags & PRE_PSI)
4425  {
4426 #ifdef _OPENMP
4427  #pragma omp parallel for default(shared) private(k)
4428 #endif
4429  for (k = 0; k < M; k++)
4430  {
4431  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4432  nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4433  }
4434  return;
4435  } /* if(PRE_PSI) */
4436 
4437  if(ths->flags & PRE_FG_PSI)
4438  {
4439  R fg_exp_l[3*(2*m+2)];
4440 
4441  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4442  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4443  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4444 
4445 #ifdef _OPENMP
4446  #pragma omp parallel for default(shared) private(k)
4447 #endif
4448  for (k = 0; k < M; k++)
4449  {
4450  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4451  INT l;
4452  R psij_const[3*(2*m+2)];
4453  R fg_psij0 = ths->psi[2*j*3];
4454  R fg_psij1 = ths->psi[2*j*3+1];
4455  R fg_psij2 = K(1.0);
4456 
4457  psij_const[0] = fg_psij0;
4458  for(l=1; l<=2*m+1; l++)
4459  {
4460  fg_psij2 *= fg_psij1;
4461  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4462  }
4463 
4464  fg_psij0 = ths->psi[2*(j*3+1)];
4465  fg_psij1 = ths->psi[2*(j*3+1)+1];
4466  fg_psij2 = K(1.0);
4467  psij_const[2*m+2] = fg_psij0;
4468  for(l=1; l<=2*m+1; l++)
4469  {
4470  fg_psij2 *= fg_psij1;
4471  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4472  }
4473 
4474  fg_psij0 = ths->psi[2*(j*3+2)];
4475  fg_psij1 = ths->psi[2*(j*3+2)+1];
4476  fg_psij2 = K(1.0);
4477  psij_const[2*(2*m+2)] = fg_psij0;
4478  for(l=1; l<=2*m+1; l++)
4479  {
4480  fg_psij2 *= fg_psij1;
4481  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4482  }
4483 
4484  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4485  }
4486 
4487  return;
4488  } /* if(PRE_FG_PSI) */
4489 
4490  if(ths->flags & FG_PSI)
4491  {
4492  R fg_exp_l[3*(2*m+2)];
4493 
4494  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4495  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4496  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4497 
4498  sort(ths);
4499 
4500 #ifdef _OPENMP
4501  #pragma omp parallel for default(shared) private(k)
4502 #endif
4503  for (k = 0; k < M; k++)
4504  {
4505  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4506  INT u, o, l;
4507  R psij_const[3*(2*m+2)];
4508  R fg_psij0, fg_psij1, fg_psij2;
4509 
4510  uo(ths,j,&u,&o,(INT)0);
4511  fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4512  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]);
4513  fg_psij2 = K(1.0);
4514  psij_const[0] = fg_psij0;
4515  for(l=1; l<=2*m+1; l++)
4516  {
4517  fg_psij2 *= fg_psij1;
4518  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4519  }
4520 
4521  uo(ths,j,&u,&o,(INT)1);
4522  fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4523  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]);
4524  fg_psij2 = K(1.0);
4525  psij_const[2*m+2] = fg_psij0;
4526  for(l=1; l<=2*m+1; l++)
4527  {
4528  fg_psij2 *= fg_psij1;
4529  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4530  }
4531 
4532  uo(ths,j,&u,&o,(INT)2);
4533  fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
4534  fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]);
4535  fg_psij2 = K(1.0);
4536  psij_const[2*(2*m+2)] = fg_psij0;
4537  for(l=1; l<=2*m+1; l++)
4538  {
4539  fg_psij2 *= fg_psij1;
4540  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4541  }
4542 
4543  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4544  }
4545 
4546  return;
4547  } /* if(FG_PSI) */
4548 
4549  if(ths->flags & PRE_LIN_PSI)
4550  {
4551  const INT K = ths->K, ip_s = K / (m + 2);
4552 
4553  sort(ths);
4554 
4555 #ifdef _OPENMP
4556  #pragma omp parallel for default(shared) private(k)
4557 #endif
4558  for (k = 0; k < M; k++)
4559  {
4560  INT u, o, l;
4561  R ip_y, ip_w;
4562  INT ip_u;
4563  R psij_const[3*(2*m+2)];
4564  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4565 
4566  uo(ths,j,&u,&o,(INT)0);
4567  ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
4568  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4569  ip_w = ip_y - (R)(ip_u);
4570  for(l=0; l < 2*m+2; l++)
4571  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4572  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4573 
4574  uo(ths,j,&u,&o,(INT)1);
4575  ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
4576  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4577  ip_w = ip_y - (R)(ip_u);
4578  for(l=0; l < 2*m+2; l++)
4579  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4580  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4581 
4582  uo(ths,j,&u,&o,(INT)2);
4583  ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s);
4584  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4585  ip_w = ip_y - (R)(ip_u);
4586  for(l=0; l < 2*m+2; l++)
4587  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4588  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4589 
4590  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4591  }
4592  return;
4593  } /* if(PRE_LIN_PSI) */
4594 
4595  /* no precomputed psi at all */
4596 
4597  sort(ths);
4598 
4599 #ifdef _OPENMP
4600  #pragma omp parallel for default(shared) private(k)
4601 #endif
4602  for (k = 0; k < M; k++)
4603  {
4604  R psij_const[3*(2*m+2)];
4605  INT u, o, l;
4606  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4607 
4608  uo(ths,j,&u,&o,(INT)0);
4609  for(l=0;l<=2*m+1;l++)
4610  psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
4611 
4612  uo(ths,j,&u,&o,(INT)1);
4613  for(l=0;l<=2*m+1;l++)
4614  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
4615 
4616  uo(ths,j,&u,&o,(INT)2);
4617  for(l=0;l<=2*m+1;l++)
4618  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
4619 
4620  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4621  }
4622 }
4623 
4624 #ifdef OMP_ASSERT
4625 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4626 { \
4627  assert(ar_x[2*k] >= min_u_a || k == M-1); \
4628  if (k > 0) \
4629  assert(ar_x[2*k-2] < min_u_a); \
4630 }
4631 #else
4632 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
4633 #endif
4634 
4635 #ifdef OMP_ASSERT
4636 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4637 { \
4638  assert(ar_x[2*k] >= min_u_b || k == M-1); \
4639  if (k > 0) \
4640  assert(ar_x[2*k-2] < min_u_b); \
4641 }
4642 #else
4643 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
4644 #endif
4645 
4646 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
4647  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4648  ths->psi+j*3*(2*m+2), \
4649  ths->psi+(j*3+1)*(2*m+2), \
4650  ths->psi+(j*3+2)*(2*m+2), \
4651  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4652  n0, n1, n2, m, my_u0, my_o0);
4653 
4654 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
4655 { \
4656  INT u, o, l; \
4657  R psij_const[3*(2*m+2)]; \
4658  R fg_psij0 = ths->psi[2*j*3]; \
4659  R fg_psij1 = ths->psi[2*j*3+1]; \
4660  R fg_psij2 = K(1.0); \
4661  \
4662  psij_const[0] = fg_psij0; \
4663  for(l=1; l<=2*m+1; l++) \
4664  { \
4665  fg_psij2 *= fg_psij1; \
4666  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4667  } \
4668  \
4669  fg_psij0 = ths->psi[2*(j*3+1)]; \
4670  fg_psij1 = ths->psi[2*(j*3+1)+1]; \
4671  fg_psij2 = K(1.0); \
4672  psij_const[2*m+2] = fg_psij0; \
4673  for(l=1; l<=2*m+1; l++) \
4674  { \
4675  fg_psij2 *= fg_psij1; \
4676  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4677  } \
4678  \
4679  fg_psij0 = ths->psi[2*(j*3+2)]; \
4680  fg_psij1 = ths->psi[2*(j*3+2)+1]; \
4681  fg_psij2 = K(1.0); \
4682  psij_const[2*(2*m+2)] = fg_psij0; \
4683  for(l=1; l<=2*m+1; l++) \
4684  { \
4685  fg_psij2 *= fg_psij1; \
4686  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4687  } \
4688  \
4689  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4690  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4691  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4692  n0, n1, n2, m, my_u0, my_o0); \
4693 }
4694 
4695 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
4696 { \
4697  INT u, o, l; \
4698  R psij_const[3*(2*m+2)]; \
4699  R fg_psij0, fg_psij1, fg_psij2; \
4700  \
4701  uo(ths,j,&u,&o,(INT)0); \
4702  fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/n0,0)); \
4703  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \
4704  fg_psij2 = K(1.0); \
4705  psij_const[0] = fg_psij0; \
4706  for(l=1; l<=2*m+1; l++) \
4707  { \
4708  fg_psij2 *= fg_psij1; \
4709  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4710  } \
4711  \
4712  uo(ths,j,&u,&o,(INT)1); \
4713  fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/n1,1)); \
4714  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \
4715  fg_psij2 = K(1.0); \
4716  psij_const[2*m+2] = fg_psij0; \
4717  for(l=1; l<=2*m+1; l++) \
4718  { \
4719  fg_psij2 *= fg_psij1; \
4720  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4721  } \
4722  \
4723  uo(ths,j,&u,&o,(INT)2); \
4724  fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/n2,2)); \
4725  fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \
4726  fg_psij2 = K(1.0); \
4727  psij_const[2*(2*m+2)] = fg_psij0; \
4728  for(l=1; l<=2*m+1; l++) \
4729  { \
4730  fg_psij2 *= fg_psij1; \
4731  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4732  } \
4733  \
4734  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4735  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4736  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4737  n0, n1, n2, m, my_u0, my_o0); \
4738 }
4739 
4740 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
4741 { \
4742  INT u, o, l; \
4743  R psij_const[3*(2*m+2)]; \
4744  INT ip_u; \
4745  R ip_y, ip_w; \
4746  \
4747  uo(ths,j,&u,&o,(INT)0); \
4748  ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \
4749  ip_u = LRINT(FLOOR(ip_y)); \
4750  ip_w = ip_y-ip_u; \
4751  for(l=0; l < 2*m+2; l++) \
4752  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4753  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
4754  \
4755  uo(ths,j,&u,&o,(INT)1); \
4756  ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \
4757  ip_u = LRINT(FLOOR(ip_y)); \
4758  ip_w = ip_y-ip_u; \
4759  for(l=0; l < 2*m+2; l++) \
4760  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4761  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4762  \
4763  uo(ths,j,&u,&o,(INT)2); \
4764  ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \
4765  ip_u = LRINT(FLOOR(ip_y)); \
4766  ip_w = ip_y-ip_u; \
4767  for(l=0; l < 2*m+2; l++) \
4768  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4769  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4770  \
4771  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4772  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4773  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4774  n0, n1, n2, m, my_u0, my_o0); \
4775 }
4776 
4777 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
4778 { \
4779  INT u, o, l; \
4780  R psij_const[3*(2*m+2)]; \
4781  \
4782  uo(ths,j,&u,&o,(INT)0); \
4783  for(l=0;l<=2*m+1;l++) \
4784  psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/n0,0)); \
4785  \
4786  uo(ths,j,&u,&o,(INT)1); \
4787  for(l=0;l<=2*m+1;l++) \
4788  psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/n1,1)); \
4789  \
4790  uo(ths,j,&u,&o,(INT)2); \
4791  for(l=0;l<=2*m+1;l++) \
4792  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
4793  \
4794  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4795  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4796  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4797  n0, n1, n2, m, my_u0, my_o0); \
4798 }
4799 
4800 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
4801 { \
4802  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
4803  { \
4804  _Pragma("omp parallel private(k)") \
4805  { \
4806  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
4807  INT *ar_x = ths->index_x; \
4808  \
4809  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
4810  &min_u_b, &max_u_b, 3, ths->n, m); \
4811  \
4812  if (min_u_a != -1) \
4813  { \
4814  k = index_x_binary_search(ar_x, M, min_u_a); \
4815  \
4816  MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4817  \
4818  while (k < M) \
4819  { \
4820  INT u_prod = ar_x[2*k]; \
4821  INT j = ar_x[2*k+1]; \
4822  \
4823  if (u_prod < min_u_a || u_prod > max_u_a) \
4824  break; \
4825  \
4826  MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4827  \
4828  k++; \
4829  } \
4830  } \
4831  \
4832  if (min_u_b != -1) \
4833  { \
4834  INT k = index_x_binary_search(ar_x, M, min_u_b); \
4835  \
4836  MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4837  \
4838  while (k < M) \
4839  { \
4840  INT u_prod = ar_x[2*k]; \
4841  INT j = ar_x[2*k+1]; \
4842  \
4843  if (u_prod < min_u_b || u_prod > max_u_b) \
4844  break; \
4845  \
4846  MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4847  \
4848  k++; \
4849  } \
4850  } \
4851  } /* omp parallel */ \
4852  return; \
4853  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
4854 }
4855 
4856 static void nfft_adjoint_3d_B(X(plan) *ths)
4857 {
4858  INT k;
4859  const INT n0 = ths->n[0];
4860  const INT n1 = ths->n[1];
4861  const INT n2 = ths->n[2];
4862  const INT M = ths->M_total;
4863  const INT m = ths->m;
4864 
4865  C* g = (C*) ths->g;
4866 
4867  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
4868 
4869  if(ths->flags & PRE_FULL_PSI)
4870  {
4871  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
4872  (INT)3, ths->n, m, ths->flags, ths->index_x);
4873  return;
4874  } /* if(PRE_FULL_PSI) */
4875 
4876  if(ths->flags & PRE_PSI)
4877  {
4878 #ifdef _OPENMP
4879  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
4880 #endif
4881 
4882 #ifdef _OPENMP
4883  #pragma omp parallel for default(shared) private(k)
4884 #endif
4885  for (k = 0; k < M; k++)
4886  {
4887  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4888 #ifdef _OPENMP
4889  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4890 #else
4891  nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4892 #endif
4893  }
4894  return;
4895  } /* if(PRE_PSI) */
4896 
4897  if(ths->flags & PRE_FG_PSI)
4898  {
4899  R fg_exp_l[3*(2*m+2)];
4900 
4901  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4902  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4903  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4904 
4905 #ifdef _OPENMP
4906  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
4907 #endif
4908 
4909 #ifdef _OPENMP
4910  #pragma omp parallel for default(shared) private(k)
4911 #endif
4912  for (k = 0; k < M; k++)
4913  {
4914  R psij_const[3*(2*m+2)];
4915  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4916  INT l;
4917  R fg_psij0 = ths->psi[2*j*3];
4918  R fg_psij1 = ths->psi[2*j*3+1];
4919  R fg_psij2 = K(1.0);
4920 
4921  psij_const[0] = fg_psij0;
4922  for(l=1; l<=2*m+1; l++)
4923  {
4924  fg_psij2 *= fg_psij1;
4925  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4926  }
4927 
4928  fg_psij0 = ths->psi[2*(j*3+1)];
4929  fg_psij1 = ths->psi[2*(j*3+1)+1];
4930  fg_psij2 = K(1.0);
4931  psij_const[2*m+2] = fg_psij0;
4932  for(l=1; l<=2*m+1; l++)
4933  {
4934  fg_psij2 *= fg_psij1;
4935  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4936  }
4937 
4938  fg_psij0 = ths->psi[2*(j*3+2)];
4939  fg_psij1 = ths->psi[2*(j*3+2)+1];
4940  fg_psij2 = K(1.0);
4941  psij_const[2*(2*m+2)] = fg_psij0;
4942  for(l=1; l<=2*m+1; l++)
4943  {
4944  fg_psij2 *= fg_psij1;
4945  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4946  }
4947 
4948 #ifdef _OPENMP
4949  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4950 #else
4951  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4952 #endif
4953  }
4954 
4955  return;
4956  } /* if(PRE_FG_PSI) */
4957 
4958  if(ths->flags & FG_PSI)
4959  {
4960  R fg_exp_l[3*(2*m+2)];
4961 
4962  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4963  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4964  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4965 
4966  sort(ths);
4967 
4968 #ifdef _OPENMP
4969  MACRO_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
4970 #endif
4971 
4972 #ifdef _OPENMP
4973  #pragma omp parallel for default(shared) private(k)
4974 #endif
4975  for (k = 0; k < M; k++)
4976  {
4977  INT u,o,l;
4978  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4979  R psij_const[3*(2*m+2)];
4980  R fg_psij0, fg_psij1, fg_psij2;
4981 
4982  uo(ths,j,&u,&o,(INT)0);
4983  fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4984  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]);
4985  fg_psij2 = K(1.0);
4986  psij_const[0] = fg_psij0;
4987  for(l=1; l<=2*m+1; l++)
4988  {
4989  fg_psij2 *= fg_psij1;
4990  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4991  }
4992 
4993  uo(ths,j,&u,&o,(INT)1);
4994  fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4995  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]);
4996  fg_psij2 = K(1.0);
4997  psij_const[2*m+2] = fg_psij0;
4998  for(l=1; l<=2*m+1; l++)
4999  {
5000  fg_psij2 *= fg_psij1;
5001  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5002  }
5003 
5004  uo(ths,j,&u,&o,(INT)2);
5005  fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
5006  fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]);
5007  fg_psij2 = K(1.0);
5008  psij_const[2*(2*m+2)] = fg_psij0;
5009  for(l=1; l<=2*m+1; l++)
5010  {
5011  fg_psij2 *= fg_psij1;
5012  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5013  }
5014 
5015 #ifdef _OPENMP
5016  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5017 #else
5018  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5019 #endif
5020  }
5021 
5022  return;
5023  } /* if(FG_PSI) */
5024 
5025  if(ths->flags & PRE_LIN_PSI)
5026  {
5027  const INT K = ths->K;
5028  const INT ip_s = K / (m + 2);
5029 
5030  sort(ths);
5031 
5032 #ifdef _OPENMP
5033  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
5034 #endif
5035 
5036 #ifdef _OPENMP
5037  #pragma omp parallel for default(shared) private(k)
5038 #endif
5039  for (k = 0; k < M; k++)
5040  {
5041  INT u,o,l;
5042  INT ip_u;
5043  R ip_y, ip_w;
5044  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5045  R psij_const[3*(2*m+2)];
5046 
5047  uo(ths,j,&u,&o,(INT)0);
5048  ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
5049  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5050  ip_w = ip_y - (R)(ip_u);
5051  for(l=0; l < 2*m+2; l++)
5052  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5053  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5054 
5055  uo(ths,j,&u,&o,(INT)1);
5056  ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
5057  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5058  ip_w = ip_y - (R)(ip_u);
5059  for(l=0; l < 2*m+2; l++)
5060  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5061  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5062 
5063  uo(ths,j,&u,&o,(INT)2);
5064  ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s);
5065  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5066  ip_w = ip_y - (R)(ip_u);
5067  for(l=0; l < 2*m+2; l++)
5068  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5069  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5070 
5071 #ifdef _OPENMP
5072  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5073 #else
5074  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5075 #endif
5076  }
5077  return;
5078  } /* if(PRE_LIN_PSI) */
5079 
5080  /* no precomputed psi at all */
5081  sort(ths);
5082 
5083 #ifdef _OPENMP
5084  MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5085 #endif
5086 
5087 #ifdef _OPENMP
5088  #pragma omp parallel for default(shared) private(k)
5089 #endif
5090  for (k = 0; k < M; k++)
5091  {
5092  INT u,o,l;
5093  R psij_const[3*(2*m+2)];
5094  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5095 
5096  uo(ths,j,&u,&o,(INT)0);
5097  for(l=0;l<=2*m+1;l++)
5098  psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
5099 
5100  uo(ths,j,&u,&o,(INT)1);
5101  for(l=0;l<=2*m+1;l++)
5102  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
5103 
5104  uo(ths,j,&u,&o,(INT)2);
5105  for(l=0;l<=2*m+1;l++)
5106  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
5107 
5108 #ifdef _OPENMP
5109  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5110 #else
5111  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5112 #endif
5113  }
5114 }
5115 
5116 
5117 void X(trafo_3d)(X(plan) *ths)
5118 {
5119  INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5120  C *g_hat,*f_hat;
5121  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5122  R ck01, ck02, ck11, ck12, ck21, ck22;
5123  C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5124  C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5125 
5126  ths->g_hat=ths->g1;
5127  ths->g=ths->g2;
5128 
5129  N0=ths->N[0];
5130  N1=ths->N[1];
5131  N2=ths->N[2];
5132  n0=ths->n[0];
5133  n1=ths->n[1];
5134  n2=ths->n[2];
5135 
5136  f_hat=(C*)ths->f_hat;
5137  g_hat=(C*)ths->g_hat;
5138 
5139  TIC(0)
5140 #ifdef _OPENMP
5141  #pragma omp parallel for default(shared) private(k0)
5142  for (k0 = 0; k0 < ths->n_total; k0++)
5143  ths->g_hat[k0] = 0.0;
5144 #else
5145  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
5146 #endif
5147 
5148  if(ths->flags & PRE_PHI_HUT)
5149  {
5150  c_phi_inv01=ths->c_phi_inv[0];
5151  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5152 
5153 #ifdef _OPENMP
5154  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5155 #endif
5156  for(k0=0;k0<N0/2;k0++)
5157  {
5158  ck01=c_phi_inv01[k0];
5159  ck02=c_phi_inv02[k0];
5160  c_phi_inv11=ths->c_phi_inv[1];
5161  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5162 
5163  for(k1=0;k1<N1/2;k1++)
5164  {
5165  ck11=c_phi_inv11[k1];
5166  ck12=c_phi_inv12[k1];
5167  c_phi_inv21=ths->c_phi_inv[2];
5168  c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5169 
5170  g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5171  f_hat111=f_hat + (k0*N1+k1)*N2;
5172  g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5173  f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5174  g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5175  f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5176  g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5177  f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5178 
5179  g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5180  f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5181  g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5182  f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5183  g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5184  f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5185  g_hat222=g_hat + (k0*n1+k1)*n2;
5186  f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5187 
5188  for(k2=0;k2<N2/2;k2++)
5189  {
5190  ck21=c_phi_inv21[k2];
5191  ck22=c_phi_inv22[k2];
5192 
5193  g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5194  g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5195  g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5196  g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5197 
5198  g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5199  g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5200  g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5201  g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5202  }
5203  }
5204  }
5205  }
5206  else
5207 #ifdef _OPENMP
5208  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5209 #endif
5210  for(k0=0;k0<N0/2;k0++)
5211  {
5212  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5213  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5214  for(k1=0;k1<N1/2;k1++)
5215  {
5216  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5217  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5218 
5219  for(k2=0;k2<N2/2;k2++)
5220  {
5221  ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5222  ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5223 
5224  g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5225  g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5226  g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5227  g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5228 
5229  g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5230  g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5231  g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5232  g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5233  }
5234  }
5235  }
5236 
5237  TOC(0)
5238 
5239  TIC_FFTW(1)
5240  FFTW(execute)(ths->my_fftw_plan1);
5241  TOC_FFTW(1);
5242 
5243  TIC(2);
5244  nfft_trafo_3d_B(ths);
5245  TOC(2);
5246 }
5247 
5248 void X(adjoint_3d)(X(plan) *ths)
5249 {
5250  INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5251  C *g_hat,*f_hat;
5252  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5253  R ck01, ck02, ck11, ck12, ck21, ck22;
5254  C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5255  C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5256 
5257  ths->g_hat=ths->g1;
5258  ths->g=ths->g2;
5259 
5260  N0=ths->N[0];
5261  N1=ths->N[1];
5262  N2=ths->N[2];
5263  n0=ths->n[0];
5264  n1=ths->n[1];
5265  n2=ths->n[2];
5266 
5267  f_hat=(C*)ths->f_hat;
5268  g_hat=(C*)ths->g_hat;
5269 
5270  TIC(2);
5271  nfft_adjoint_3d_B(ths);
5272  TOC(2);
5273 
5274  TIC_FFTW(1)
5275  FFTW(execute)(ths->my_fftw_plan2);
5276  TOC_FFTW(1);
5277 
5278  TIC(0)
5279  if(ths->flags & PRE_PHI_HUT)
5280  {
5281  c_phi_inv01=ths->c_phi_inv[0];
5282  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5283 
5284 #ifdef _OPENMP
5285  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5286 #endif
5287  for(k0=0;k0<N0/2;k0++)
5288  {
5289  ck01=c_phi_inv01[k0];
5290  ck02=c_phi_inv02[k0];
5291  c_phi_inv11=ths->c_phi_inv[1];
5292  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5293 
5294  for(k1=0;k1<N1/2;k1++)
5295  {
5296  ck11=c_phi_inv11[k1];
5297  ck12=c_phi_inv12[k1];
5298  c_phi_inv21=ths->c_phi_inv[2];
5299  c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5300 
5301  g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5302  f_hat111=f_hat + (k0*N1+k1)*N2;
5303  g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5304  f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5305  g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5306  f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5307  g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5308  f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5309 
5310  g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5311  f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5312  g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5313  f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5314  g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5315  f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5316  g_hat222=g_hat + (k0*n1+k1)*n2;
5317  f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5318 
5319  for(k2=0;k2<N2/2;k2++)
5320  {
5321  ck21=c_phi_inv21[k2];
5322  ck22=c_phi_inv22[k2];
5323 
5324  f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5325  f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5326  f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5327  f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5328 
5329  f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5330  f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5331  f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5332  f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5333  }
5334  }
5335  }
5336  }
5337  else
5338 #ifdef _OPENMP
5339  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5340 #endif
5341  for(k0=0;k0<N0/2;k0++)
5342  {
5343  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5344  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5345  for(k1=0;k1<N1/2;k1++)
5346  {
5347  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5348  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5349 
5350  for(k2=0;k2<N2/2;k2++)
5351  {
5352  ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5353  ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5354 
5355  f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5356  f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5357  f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5358  f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5359 
5360  f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5361  f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5362  f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5363  f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5364  }
5365  }
5366  }
5367 
5368  TOC(0)
5369 }
5370 
5373 void X(trafo)(X(plan) *ths)
5374 {
5375  switch(ths->d)
5376  {
5377  case 1: X(trafo_1d)(ths); break;
5378  case 2: X(trafo_2d)(ths); break;
5379  case 3: X(trafo_3d)(ths); break;
5380  default:
5381  {
5382  /* use ths->my_fftw_plan1 */
5383  ths->g_hat = ths->g1;
5384  ths->g = ths->g2;
5385 
5389  TIC(0)
5390  D_A(ths);
5391  TOC(0)
5392 
5397  TIC_FFTW(1)
5398  FFTW(execute)(ths->my_fftw_plan1);
5399  TOC_FFTW(1)
5400 
5404  TIC(2)
5405  B_A(ths);
5406  TOC(2)
5407  }
5408  }
5409 } /* nfft_trafo */
5410 
5411 void X(adjoint)(X(plan) *ths)
5412 {
5413  switch(ths->d)
5414  {
5415  case 1: X(adjoint_1d)(ths); break;
5416  case 2: X(adjoint_2d)(ths); break;
5417  case 3: X(adjoint_3d)(ths); break;
5418  default:
5419  {
5420  /* use ths->my_fftw_plan2 */
5421  ths->g_hat=ths->g1;
5422  ths->g=ths->g2;
5423 
5427  TIC(2)
5428  B_T(ths);
5429  TOC(2)
5430 
5435  TIC_FFTW(1)
5436  FFTW(execute)(ths->my_fftw_plan2);
5437  TOC_FFTW(1)
5438 
5442  TIC(0)
5443  D_T(ths);
5444  TOC(0)
5445  }
5446  }
5447 } /* nfft_adjoint */
5448 
5449 
5452 static void precompute_phi_hut(X(plan) *ths)
5453 {
5454  INT ks[ths->d]; /* index over all frequencies */
5455  INT t; /* index over all dimensions */
5456 
5457  ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) * sizeof(R*));
5458 
5459  for (t = 0; t < ths->d; t++)
5460  {
5461  ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) * sizeof(R));
5462 
5463  for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++)
5464  {
5465  ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t));
5466  }
5467  }
5468 } /* nfft_phi_hut */
5469 
5474 void X(precompute_lin_psi)(X(plan) *ths)
5475 {
5476  INT t;
5477  INT j;
5478  R step;
5480  for (t=0; t<ths->d; t++)
5481  {
5482  step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t]));
5483  for(j = 0;j <= ths->K; j++)
5484  {
5485  ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t);
5486  } /* for(j) */
5487  } /* for(t) */
5488 }
5489 
5490 void X(precompute_fg_psi)(X(plan) *ths)
5491 {
5492  INT t;
5493  INT u, o;
5495  sort(ths);
5496 
5497  for (t=0; t<ths->d; t++)
5498  {
5499  INT j;
5500 #ifdef _OPENMP
5501  #pragma omp parallel for default(shared) private(j,u,o)
5502 #endif
5503  for (j = 0; j < ths->M_total; j++)
5504  {
5505  uo(ths,j,&u,&o,t);
5506 
5507  ths->psi[2*(j*ths->d+t)]=
5508  (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t));
5509 
5510  ths->psi[2*(j*ths->d+t)+1]=
5511  EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]);
5512  } /* for(j) */
5513  }
5514  /* for(t) */
5515 } /* nfft_precompute_fg_psi */
5516 
5517 void X(precompute_psi)(X(plan) *ths)
5518 {
5519  INT t; /* index over all dimensions */
5520  INT l; /* index u<=l<=o */
5521  INT lj; /* index 0<=lj<u+o+1 */
5522  INT u, o; /* depends on x_j */
5523 
5524  sort(ths);
5525 
5526  for (t=0; t<ths->d; t++)
5527  {
5528  INT j;
5529 #ifdef _OPENMP
5530  #pragma omp parallel for default(shared) private(j,l,lj,u,o)
5531 #endif
5532  for (j = 0; j < ths->M_total; j++)
5533  {
5534  uo(ths,j,&u,&o,t);
5535 
5536  for(l = u, lj = 0; l <= o; l++, lj++)
5537  ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
5538  (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t));
5539  } /* for(j) */
5540  }
5541  /* for(t) */
5542 } /* nfft_precompute_psi */
5543 
5544 #ifdef _OPENMP
5545 static void nfft_precompute_full_psi_omp(X(plan) *ths)
5546 {
5547  INT j;
5548  INT lprod;
5550  {
5551  INT t;
5552  for(t=0,lprod = 1; t<ths->d; t++)
5553  lprod *= 2*ths->m+2;
5554  }
5555 
5556  #pragma omp parallel for default(shared) private(j)
5557  for(j=0; j<ths->M_total; j++)
5558  {
5559  INT t,t2;
5560  INT l_L;
5561  INT l[ths->d];
5562  INT lj[ths->d];
5563  INT ll_plain[ths->d+1];
5565  INT u[ths->d], o[ths->d];
5567  R phi_prod[ths->d+1];
5568  INT ix = j*lprod;
5569 
5570  phi_prod[0]=1;
5571  ll_plain[0]=0;
5572 
5573  MACRO_init_uo_l_lj_t;
5574 
5575  for(l_L=0; l_L<lprod; l_L++, ix++)
5576  {
5577  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5578 
5579  ths->psi_index_g[ix]=ll_plain[ths->d];
5580  ths->psi[ix]=phi_prod[ths->d];
5581 
5582  MACRO_count_uo_l_lj_t;
5583  } /* for(l_L) */
5584 
5585  ths->psi_index_f[j]=lprod;
5586  } /* for(j) */
5587 }
5588 #endif
5589 
5590 void X(precompute_full_psi)(X(plan) *ths)
5591 {
5592 #ifdef _OPENMP
5593  sort(ths);
5594 
5595  nfft_precompute_full_psi_omp(ths);
5596 #else
5597  INT t, t2; /* index over all dimensions */
5598  INT j; /* index over all nodes */
5599  INT l_L; /* plain index 0 <= l_L < lprod */
5600  INT l[ths->d]; /* multi index u<=l<=o */
5601  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
5602  INT ll_plain[ths->d+1]; /* postfix plain index */
5603  INT lprod; /* 'bandwidth' of matrix B */
5604  INT u[ths->d], o[ths->d]; /* depends on x_j */
5605 
5606  R phi_prod[ths->d+1];
5607 
5608  INT ix, ix_old;
5609 
5610  sort(ths);
5611 
5612  phi_prod[0] = K(1.0);
5613  ll_plain[0] = 0;
5614 
5615  for (t = 0, lprod = 1; t < ths->d; t++)
5616  lprod *= 2 * ths->m + 2;
5617 
5618  for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
5619  {
5620  MACRO_init_uo_l_lj_t;
5621 
5622  for (l_L = 0; l_L < lprod; l_L++, ix++)
5623  {
5624  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5625 
5626  ths->psi_index_g[ix] = ll_plain[ths->d];
5627  ths->psi[ix] = phi_prod[ths->d];
5628 
5629  MACRO_count_uo_l_lj_t;
5630  } /* for(l_L) */
5631 
5632  ths->psi_index_f[j] = ix - ix_old;
5633  ix_old = ix;
5634  } /* for(j) */
5635 #endif
5636 }
5637 
5638 void X(precompute_one_psi)(X(plan) *ths)
5639 {
5640  if(ths->flags & PRE_LIN_PSI)
5641  X(precompute_lin_psi)(ths);
5642  if(ths->flags & PRE_FG_PSI)
5643  X(precompute_fg_psi)(ths);
5644  if(ths->flags & PRE_PSI)
5645  X(precompute_psi)(ths);
5646  if(ths->flags & PRE_FULL_PSI)
5647  X(precompute_full_psi)(ths);
5648 }
5649 
5650 static void init_help(X(plan) *ths)
5651 {
5652  INT t; /* index over all dimensions */
5653  INT lprod; /* 'bandwidth' of matrix B */
5654 
5655  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5656  ths->flags |= NFFT_SORT_NODES;
5657 
5658  ths->N_total = intprod(ths->N, 0, ths->d);
5659  ths->n_total = intprod(ths->n, 0, ths->d);
5660 
5661  ths->sigma = (R*) Y(malloc)((size_t)(ths->d) * sizeof(R));
5662 
5663  for(t = 0;t < ths->d; t++)
5664  ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]);
5665 
5666  WINDOW_HELP_INIT;
5667 
5668  if(ths->flags & MALLOC_X)
5669  ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) * sizeof(R));
5670 
5671  if(ths->flags & MALLOC_F_HAT)
5672  ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) * sizeof(C));
5673 
5674  if(ths->flags & MALLOC_F)
5675  ths->f = (C*)Y(malloc)((size_t)(ths->M_total) * sizeof(C));
5676 
5677  if(ths->flags & PRE_PHI_HUT)
5678  precompute_phi_hut(ths);
5679 
5680  if (ths->flags & PRE_LIN_PSI)
5681  {
5682  if (ths->K == 0)
5683  {
5684  ths->K = Y(m2K)(ths->m);
5685  }
5686  ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) * sizeof(R));
5687  }
5688 
5689  if(ths->flags & PRE_FG_PSI)
5690  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) * sizeof(R));
5691 
5692  if(ths->flags & PRE_PSI)
5693  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) * sizeof(R));
5694 
5695  if(ths->flags & PRE_FULL_PSI)
5696  {
5697  for (t = 0, lprod = 1; t < ths->d; t++)
5698  lprod *= 2 * ths->m + 2;
5699 
5700  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(R));
5701 
5702  ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) * sizeof(INT));
5703  ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(INT));
5704  }
5705 
5706  if(ths->flags & FFTW_INIT)
5707  {
5708 #ifdef _OPENMP
5709  INT nthreads = Y(get_num_threads)();
5710 #endif
5711 
5712  ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) * sizeof(C));
5713 
5714  if(ths->flags & FFT_OUT_OF_PLACE)
5715  ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) * sizeof(C));
5716  else
5717  ths->g2 = ths->g1;
5718 
5719 #ifdef _OPENMP
5720 #pragma omp critical (nfft_omp_critical_fftw_plan)
5721 {
5722  FFTW(plan_with_nthreads)(nthreads);
5723 #endif
5724  {
5725  int *_n = Y(malloc)((size_t)(ths->d) * sizeof(int));
5726 
5727  for (t = 0; t < ths->d; t++)
5728  _n[t] = (int)(ths->n[t]);
5729 
5730  ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
5731  ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags);
5732  Y(free)(_n);
5733  }
5734 #ifdef _OPENMP
5735 }
5736 #endif
5737  }
5738 
5739  if(ths->flags & NFFT_SORT_NODES)
5740  ths->index_x = (INT*) Y(malloc)(sizeof(INT) * 2U * (size_t)(ths->M_total));
5741  else
5742  ths->index_x = NULL;
5743 
5744  ths->mv_trafo = (void (*) (void* ))X(trafo);
5745  ths->mv_adjoint = (void (*) (void* ))X(adjoint);
5746 }
5747 
5748 void X(init)(X(plan) *ths, int d, int *N, int M_total)
5749 {
5750  INT t; /* index over all dimensions */
5751 
5752  ths->d = (INT)d;
5753 
5754  ths->N = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
5755 
5756  for (t = 0; t < d; t++)
5757  ths->N[t] = (INT)N[t];
5758 
5759  ths->M_total = (INT)M_total;
5760 
5761  ths->n = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
5762 
5763  for (t = 0; t < d; t++)
5764  ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]));
5765 
5766  ths->m = WINDOW_HELP_ESTIMATE_m;
5767 
5768  if (d > 1)
5769  {
5770 #ifdef _OPENMP
5771  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5772  FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
5773  NFFT_OMP_BLOCKWISE_ADJOINT;
5774 #else
5775  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5776  FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
5777 #endif
5778  }
5779  else
5780  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5781  FFTW_INIT | FFT_OUT_OF_PLACE;
5782 
5783  ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
5784 
5785  ths->K = 0;
5786  init_help(ths);
5787 }
5788 
5789 void X(init_guru)(X(plan) *ths, int d, int *N, int M_total, int *n, int m,
5790  unsigned flags, unsigned fftw_flags)
5791 {
5792  INT t; /* index over all dimensions */
5793 
5794  ths->d = (INT)d;
5795  ths->M_total = (INT)M_total;
5796  ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5797 
5798  for (t = 0; t < d; t++)
5799  ths->N[t] = (INT)N[t];
5800 
5801  ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5802 
5803  for (t = 0; t < d; t++)
5804  ths->n[t] = (INT)n[t];
5805 
5806  ths->m = (INT)m;
5807 
5808  ths->flags = flags;
5809  ths->fftw_flags = fftw_flags;
5810 
5811  ths->K = 0;
5812  init_help(ths);
5813 }
5814 
5815 void X(init_lin)(X(plan) *ths, int d, int *N, int M_total, int *n, int m, int K,
5816  unsigned flags, unsigned fftw_flags)
5817 {
5818  INT t; /* index over all dimensions */
5819 
5820  ths->d = (INT)d;
5821  ths->M_total = (INT)M_total;
5822  ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5823 
5824  for (t = 0; t < d; t++)
5825  ths->N[t] = (INT)N[t];
5826 
5827  ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5828 
5829  for (t = 0; t < d; t++)
5830  ths->n[t] = (INT)n[t];
5831 
5832  ths->m = (INT)m;
5833 
5834  ths->flags = flags;
5835  ths->fftw_flags = fftw_flags;
5836 
5837  ths->K = K;
5838  init_help(ths);
5839 }
5840 
5841 void X(init_1d)(X(plan) *ths, int N1, int M_total)
5842 {
5843  int N[1];
5844 
5845  N[0] = N1;
5846 
5847  X(init)(ths, 1, N, M_total);
5848 }
5849 
5850 void X(init_2d)(X(plan) *ths, int N1, int N2, int M_total)
5851 {
5852  int N[2];
5853 
5854  N[0] = N1;
5855  N[1] = N2;
5856  X(init)(ths, 2, N, M_total);
5857 }
5858 
5859 void X(init_3d)(X(plan) *ths, int N1, int N2, int N3, int M_total)
5860 {
5861  int N[3];
5862 
5863  N[0] = N1;
5864  N[1] = N2;
5865  N[2] = N3;
5866  X(init)(ths, 3, N, M_total);
5867 }
5868 
5869 const char* X(check)(X(plan) *ths)
5870 {
5871  INT j;
5872 
5873  if (!ths->f)
5874  return "Member f not initialized.";
5875 
5876  if (!ths->x)
5877  return "Member x not initialized.";
5878 
5879  if (!ths->f_hat)
5880  return "Member f_hat not initialized.";
5881 
5882  if ((ths->flags & PRE_LIN_PSI) && ths->K < ths->M_total)
5883  return "Number of nodes too small to use PRE_LIN_PSI.";
5884 
5885  for (j = 0; j < ths->M_total * ths->d; j++)
5886  {
5887  if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
5888  {
5889  return "ths->x out of range [-0.5,0.5)";
5890  }
5891  }
5892 
5893  for (j = 0; j < ths->d; j++)
5894  {
5895  if (ths->sigma[j] <= 1)
5896  return "Oversampling factor too small";
5897 
5898  if(ths->N[j] <= ths->m)
5899  return "Polynomial degree N is smaller than cut-off m";
5900 
5901  if(ths->N[j]%2 == 1)
5902  return "polynomial degree N has to be even";
5903  }
5904  return 0;
5905 }
5906 
5907 void X(finalize)(X(plan) *ths)
5908 {
5909  INT t; /* index over dimensions */
5910 
5911  if(ths->flags & NFFT_SORT_NODES)
5912  Y(free)(ths->index_x);
5913 
5914  if(ths->flags & FFTW_INIT)
5915  {
5916 #ifdef _OPENMP
5917  #pragma omp critical (nfft_omp_critical_fftw_plan)
5918 #endif
5919  FFTW(destroy_plan)(ths->my_fftw_plan2);
5920 #ifdef _OPENMP
5921  #pragma omp critical (nfft_omp_critical_fftw_plan)
5922 #endif
5923  FFTW(destroy_plan)(ths->my_fftw_plan1);
5924 
5925  if(ths->flags & FFT_OUT_OF_PLACE)
5926  Y(free)(ths->g2);
5927 
5928  Y(free)(ths->g1);
5929  }
5930 
5931  if(ths->flags & PRE_FULL_PSI)
5932  {
5933  Y(free)(ths->psi_index_g);
5934  Y(free)(ths->psi_index_f);
5935  Y(free)(ths->psi);
5936  }
5937 
5938  if(ths->flags & PRE_PSI)
5939  Y(free)(ths->psi);
5940 
5941  if(ths->flags & PRE_FG_PSI)
5942  Y(free)(ths->psi);
5943 
5944  if(ths->flags & PRE_LIN_PSI)
5945  Y(free)(ths->psi);
5946 
5947  if(ths->flags & PRE_PHI_HUT)
5948  {
5949  for (t = 0; t < ths->d; t++)
5950  Y(free)(ths->c_phi_inv[t]);
5951  Y(free)(ths->c_phi_inv);
5952  }
5953 
5954  if(ths->flags & MALLOC_F)
5955  Y(free)(ths->f);
5956 
5957  if(ths->flags & MALLOC_F_HAT)
5958  Y(free)(ths->f_hat);
5959 
5960  if(ths->flags & MALLOC_X)
5961  Y(free)(ths->x);
5962 
5963  WINDOW_HELP_FINALIZE;
5964 
5965  Y(free)(ths->sigma);
5966  Y(free)(ths->n);
5967  Y(free)(ths->N);
5968 }
#define TIC(a)
Timing, method works since the inaccurate timer is updated mostly in the measured function...
Definition: infft.h:1415
#define X(name)
Include header for C99 complex datatype.
Definition: fastsum.h:53
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.
Definition: infft.h:1383