Hello,
This issue might be related to #294.
3.3.10
I experienced what seems to be a bug when compiling with --generic-256simd
option. I obtain different results when running in sequential and when running with multiple threads (openmp).
Identical results whatever the number of threads.
Here is a piece of code that allows to reproduce the problem. I put identical data into data1 and data2 and perform a sequential transform on the first, and a 4 threads transform on the the second. Then I get the indices where results are differents and print the corresponding values:
#include <array>
#include <cassert>
#include <complex>
#include <cstddef>
#include <fftw3.h>
#include <filesystem>
#include <functional>
#include <iostream>
#include <numeric>
#include <omp.h>
#include <random>
#include <vector>
using namespace std;
fftw_complex *init_rand_data(int n)
{
uniform_real_distribution<double> distrib(0.0, 1.0);
mt19937 engine;
auto gen = [&distrib, &engine]() { return distrib(engine); };
fftw_complex *res;
res = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * n);
for (size_t i = 0; i < n; i++)
res[i][0] = gen();
return res;
}
vector<size_t> get_diff_indices(const fftw_complex *v1, const fftw_complex *v2,
int n)
{
vector<size_t> is{};
for (size_t i = 0; i < n; i++)
{
if ((v1[i][0] != v2[i][0]) || (v1[i][1] != v2[i][1]))
is.push_back(i);
}
return is;
}
void diff_print(const vector<size_t> &i_diff, const fftw_complex *d1,
const fftw_complex *d2, int n)
{
for (auto const i : i_diff)
{
cout << "data1[" << i << "]=" << d1[i][0] << "," << d1[i][1]
<< " data2[" << i << "]=" << d2[i][0] << "," << d2[i][1] << "\n";
}
}
int main(int argc, char *argv[])
{
fftw_init_threads();
const array<int, 3> N_global{256, 256, 256};
const int N = N_global[0] * N_global[1] * N_global[2];
cout << "Domain size: " << N_global[0] << "*" << N_global[1] << "*"
<< N_global[2] << "\n";
auto data1 = init_rand_data(N);
fftw_complex *data2;
data2 = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
for (size_t i = 0; i < N; i++)
{
data2[i][0] = data1[i][0];
data2[i][1] = data1[i][1];
}
cout << "data generated" << endl;
/** 1 thread case */
int nth1 = 1;
fftw_plan_with_nthreads(nth1);
auto p1 = fftw_plan_dft(3, &N_global[0], data1, data1, -1, FFTW_ESTIMATE);
cout << "Running with " << nth1 << "thread(s)\n ";
fftw_execute(p1);
/** n threads case */
int nth2 = 4;
fftw_plan_with_nthreads(nth2);
auto p2 = fftw_plan_dft(3, &N_global[0], data2, data2, -1, FFTW_ESTIMATE);
cout << "Running with " << nth2 << "thread(s)\n ";
fftw_execute(p2);
/** For debug */
// data1[76][0] = 12.0;
// data1[98][0] = -1.0;
/** Differences between data1 and data2*/
auto diff = get_diff_indices(data1, data2, N);
/** Print differences by process */
cout << "Differences:\n ";
diff_print(diff, data1, data2, N);
cout << "End" << endl;
fftw_destroy_plan(p1);
fftw_free(data1);
fftw_destroy_plan(p2);
fftw_cleanup_threads();
}
Here is a sample of the output with the ``--generic-256simd` option:
Domain size: 256*256*256
data generated
Running with 1thread(s)
Running with 4thread(s)
Differences:
data1[16384]=315.297,235.74 data2[16384]=960.016,505.978
data1[32768]=1983.18,1.27898e-13 data2[32768]=-134.363,-93.4117
data1[49152]=315.297,-235.74 data2[49152]=529.096,-620.237
data1[81920]=-22.5832,-181.609 data2[81920]=435.089,313.97
data1[98304]=-606.381,233.907 data2[98304]=170.796,-30.2235
data1[114688]=105.228,149.729 data2[114688]=-174.098,-249.013
data1[147456]=-952.618,-164.618 data2[147456]=-2512.8,214.681
data1[163840]=-4191.25,-735.789 data2[163840]=-405.033,-1910
....
and the output without the--generic-256simd
option:
Domain size: 256*256*256
data generated
Running with 1thread(s)
Running with 4thread(s)
Differences:
End
I tried smaller domain sizes (128³, 64³ ...) and the problem is not visible.
Best,
Simon