837    std::vector<fctdata> fact;
 
  839    void add_factor(
size_t factor)
 
  840      { fact.push_back({
factor, 
nullptr, 
nullptr}); }
 
  842template<
bool fwd, 
typename T> 
void pass2 (
size_t ido, 
size_t l1,
 
  846  auto CH = [
ch,
ido,
l1](
size_t a, 
size_t b, 
size_t c) -> 
T&
 
  848  auto CC = [
cc,
ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
  849    { 
return cc[
a+
ido*(b+2*c)]; };
 
  850  auto WA = [
wa, 
ido](
size_t x, 
size_t i)
 
  851    { 
return wa[i-1+
x*(
ido-1)]; };
 
  854    for (
size_t k=0; 
k<
l1; ++
k)
 
  860    for (
size_t k=0; 
k<
l1; ++
k)
 
  864      for (
size_t i=1; i<
ido; ++i)
 
  872#define POCKETFFT_PREP3(idx) \ 
  873        T t0 = CC(idx,0,k), t1, t2; \ 
  874        PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ 
 
  876#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ 
  879        T cb{-t2.i*twi, t2.r*twi}; \ 
  880        PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ 
 
  882#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ 
  885        T cb{-t2.i*twi, t2.r*twi}; \ 
  886        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ 
  887        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ 
 
  889template<
bool fwd, 
typename T> 
void pass3 (
size_t ido, 
size_t l1,
 
  893  constexpr T0 tw1r=-0.5,
 
  894               tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
 
  896  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
  897    { 
return ch[a+ido*(b+l1*c)]; };
 
  898  auto CC = [cc,ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
  899    { 
return cc[a+ido*(b+3*c)]; };
 
  900  auto WA = [wa, ido](
size_t x, 
size_t i)
 
  901    { 
return wa[i-1+x*(ido-1)]; };
 
  904    for (
size_t k=0; k<l1; ++k)
 
  910    for (
size_t k=0; k<l1; ++k)
 
  916      for (
size_t i=1; i<ido; ++i)
 
  924#undef POCKETFFT_PARTSTEP3b 
  925#undef POCKETFFT_PARTSTEP3a 
  926#undef POCKETFFT_PREP3 
  928template<
bool fwd, 
typename T> 
void pass4 (
size_t ido, 
size_t l1,
 
  932  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
  933    { 
return ch[a+ido*(b+l1*c)]; };
 
  934  auto CC = [cc,ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
  935    { 
return cc[a+ido*(b+4*c)]; };
 
  936  auto WA = [wa, ido](
size_t x, 
size_t i)
 
  937    { 
return wa[i-1+x*(ido-1)]; };
 
  940    for (
size_t k=0; k<l1; ++k)
 
  943      PM(t2,t1,CC(0,0,k),CC(0,2,k));
 
  944      PM(t3,t4,CC(0,1,k),CC(0,3,k));
 
  946      PM(CH(0,k,0),CH(0,k,2),t2,t3);
 
  947      PM(CH(0,k,1),CH(0,k,3),t1,t4);
 
  950    for (
size_t k=0; k<l1; ++k)
 
  954      PM(t2,t1,CC(0,0,k),CC(0,2,k));
 
  955      PM(t3,t4,CC(0,1,k),CC(0,3,k));
 
  957      PM(CH(0,k,0),CH(0,k,2),t2,t3);
 
  958      PM(CH(0,k,1),CH(0,k,3),t1,t4);
 
  960      for (
size_t i=1; i<ido; ++i)
 
  963        T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
 
  968        special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
 
  969        special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
 
  970        special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
 
  975#define POCKETFFT_PREP5(idx) \ 
  976        T t0 = CC(idx,0,k), t1, t2, t3, t4; \ 
  977        PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ 
  978        PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ 
  979        CH(idx,k,0).r=t0.r+t1.r+t2.r; \ 
  980        CH(idx,k,0).i=t0.i+t1.i+t2.i; 
 
  982#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ 
  985        ca.r=t0.r+twar*t1.r+twbr*t2.r; \ 
  986        ca.i=t0.i+twar*t1.i+twbr*t2.i; \ 
  987        cb.i=twai*t4.r twbi*t3.r; \ 
  988        cb.r=-(twai*t4.i twbi*t3.i); \ 
  989        PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ 
 
  992#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ 
  995        ca.r=t0.r+twar*t1.r+twbr*t2.r; \ 
  996        ca.i=t0.i+twar*t1.i+twbr*t2.i; \ 
  997        cb.i=twai*t4.r twbi*t3.r; \ 
  998        cb.r=-(twai*t4.i twbi*t3.i); \ 
  999        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ 
 1000        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ 
 
 1002template<
bool fwd, 
typename T> 
void pass5 (
size_t ido, 
size_t l1,
 
 1006  constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
 
 1007               tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
 
 1008               tw2r= T0(-0.8090169943749474241022934171828191L),
 
 1009               tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
 
 1011  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1012    { 
return ch[a+ido*(b+l1*c)]; };
 
 1013  auto CC = [cc,ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1014    { 
return cc[a+ido*(b+5*c)]; };
 
 1015  auto WA = [wa, ido](
size_t x, 
size_t i)
 
 1016    { 
return wa[i-1+x*(ido-1)]; };
 
 1019    for (
size_t k=0; k<l1; ++k)
 
 1026    for (
size_t k=0; k<l1; ++k)
 
 1033      for (
size_t i=1; i<ido; ++i)
 
 1042#undef POCKETFFT_PARTSTEP5b 
 1043#undef POCKETFFT_PARTSTEP5a 
 1044#undef POCKETFFT_PREP5 
 1046#define POCKETFFT_PREP7(idx) \ 
 1047        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \ 
 1048        PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ 
 1049        PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ 
 1050        PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ 
 1051        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ 
 1052        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; 
 
 1054#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \ 
 1057        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \ 
 1058        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \ 
 1059        cb.i=y1*t7.r y2*t6.r y3*t5.r; \ 
 1060        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ 
 1061        PM(out1,out2,ca,cb); \ 
 
 1063#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ 
 1064        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) 
 
 1065#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \ 
 1068        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ 
 1069        special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \ 
 1070        special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \ 
 
 1073template<
bool fwd, 
typename T> 
void pass7(
size_t ido, 
size_t l1,
 
 1077  constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
 
 1078               tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
 
 1079               tw2r= T0(-0.2225209339563144042889025644967948L),
 
 1080               tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
 
 1081               tw3r= T0(-0.9009688679024191262361023195074451L),
 
 1082               tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
 
 1084  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1085    { 
return ch[a+ido*(b+l1*c)]; };
 
 1086  auto CC = [cc,ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1087    { 
return cc[a+ido*(b+7*c)]; };
 
 1088  auto WA = [wa, ido](
size_t x, 
size_t i)
 
 1089    { 
return wa[i-1+x*(ido-1)]; };
 
 1092    for (
size_t k=0; k<l1; ++k)
 
 1100    for (
size_t k=0; k<l1; ++k)
 
 1108      for (
size_t i=1; i<ido; ++i)
 
 1118#undef POCKETFFT_PARTSTEP7 
 1119#undef POCKETFFT_PARTSTEP7a0 
 1120#undef POCKETFFT_PARTSTEP7a 
 1121#undef POCKETFFT_PREP7 
 1123template <
bool fwd, 
typename T> 
void ROTX45(T &a)
 const 
 1125  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
 
 1127    { 
auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
 
 1129    { 
auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
 
 1131template <
bool fwd, 
typename T> 
void ROTX135(T &a)
 const 
 1133  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
 
 1135    { 
auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
 
 1137    { 
auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
 
 1140template<
bool fwd, 
typename T> 
void pass8 (
size_t ido, 
size_t l1,
 
 1144  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1145    { 
return ch[a+ido*(b+l1*c)]; };
 
 1146  auto CC = [cc,ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1147    { 
return cc[a+ido*(b+8*c)]; };
 
 1148  auto WA = [wa, ido](
size_t x, 
size_t i)
 
 1149    { 
return wa[i-1+x*(ido-1)]; };
 
 1152    for (
size_t k=0; k<l1; ++k)
 
 1154      T a0, a1, a2, a3, a4, a5, a6, a7;
 
 1155      PM(a1,a5,CC(0,1,k),CC(0,5,k));
 
 1156      PM(a3,a7,CC(0,3,k),CC(0,7,k));
 
 1165      PM(a0,a4,CC(0,0,k),CC(0,4,k));
 
 1166      PM(a2,a6,CC(0,2,k),CC(0,6,k));
 
 1167      PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
 
 1168      PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
 
 1170      PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
 
 1171      PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
 
 1174    for (
size_t k=0; k<l1; ++k)
 
 1177      T a0, a1, a2, a3, a4, a5, a6, a7;
 
 1178      PM(a1,a5,CC(0,1,k),CC(0,5,k));
 
 1179      PM(a3,a7,CC(0,3,k),CC(0,7,k));
 
 1188      PM(a0,a4,CC(0,0,k),CC(0,4,k));
 
 1189      PM(a2,a6,CC(0,2,k),CC(0,6,k));
 
 1190      PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
 
 1191      PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
 
 1193      PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
 
 1194      PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
 
 1196      for (
size_t i=1; i<ido; ++i)
 
 1198        T a0, a1, a2, a3, a4, a5, a6, a7;
 
 1199        PM(a1,a5,CC(i,1,k),CC(i,5,k));
 
 1200        PM(a3,a7,CC(i,3,k),CC(i,7,k));
 
 1207        PM(a0,a4,CC(i,0,k),CC(i,4,k));
 
 1208        PM(a2,a6,CC(i,2,k),CC(i,6,k));
 
 1211        special_mul<fwd>(a0-a1,WA(3,i),CH(i,k,4));
 
 1212        special_mul<fwd>(a2+a3,WA(1,i),CH(i,k,2));
 
 1213        special_mul<fwd>(a2-a3,WA(5,i),CH(i,k,6));
 
 1216        special_mul<fwd>(a4+a5,WA(0,i),CH(i,k,1));
 
 1217        special_mul<fwd>(a4-a5,WA(4,i),CH(i,k,5));
 
 1218        special_mul<fwd>(a6+a7,WA(2,i),CH(i,k,3));
 
 1219        special_mul<fwd>(a6-a7,WA(6,i),CH(i,k,7));
 
 1225#define POCKETFFT_PREP11(idx) \ 
 1226        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ 
 1227        PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ 
 1228        PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ 
 1229        PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ 
 1230        PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ 
 1231        PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ 
 1232        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ 
 1233        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; 
 
 1235#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \ 
 1237        T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \ 
 1239        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ 
 1240        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ 
 1241        PM(out1,out2,ca,cb); \ 
 
 1243#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ 
 1244        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) 
 
 1245#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ 
 1248        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ 
 1249        special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \ 
 1250        special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \ 
 
 1253template<
bool fwd, 
typename T> 
void pass11 (
size_t ido, 
size_t l1,
 
 1257  constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
 
 1258               tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
 
 1259               tw2r= T0(0.4154150130018864255292741492296232L),
 
 1260               tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L),
 
 1261               tw3r= T0(-0.1423148382732851404437926686163697L),
 
 1262               tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L),
 
 1263               tw4r= T0(-0.6548607339452850640569250724662936L),
 
 1264               tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L),
 
 1265               tw5r= T0(-0.9594929736144973898903680570663277L),
 
 1266               tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L);
 
 1268  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1269    { 
return ch[a+ido*(b+l1*c)]; };
 
 1270  auto CC = [cc,ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1271    { 
return cc[a+ido*(b+11*c)]; };
 
 1272  auto WA = [wa, ido](
size_t x, 
size_t i)
 
 1273    { 
return wa[i-1+x*(ido-1)]; };
 
 1276    for (
size_t k=0; k<l1; ++k)
 
 1279      POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
 
 1280      POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
 
 1281      POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
 
 1282      POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
 
 1283      POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
 
 1286    for (
size_t k=0; k<l1; ++k)
 
 1290      POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
 
 1291      POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
 
 1292      POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
 
 1293      POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
 
 1294      POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
 
 1296      for (
size_t i=1; i<ido; ++i)
 
 1299        POCKETFFT_PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
 
 1300        POCKETFFT_PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
 
 1301        POCKETFFT_PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
 
 1302        POCKETFFT_PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
 
 1303        POCKETFFT_PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
 
 1308#undef POCKETFFT_PARTSTEP11 
 1309#undef POCKETFFT_PARTSTEP11a0 
 1310#undef POCKETFFT_PARTSTEP11a 
 1311#undef POCKETFFT_PREP11 
 1313template<
bool fwd, 
typename T> 
void passg (
size_t ido, 
size_t ip,
 
 1318  const size_t cdim=ip;
 
 1319  size_t ipph = (ip+1)/2;
 
 1320  size_t idl1 = ido*l1;
 
 1322  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1323    { 
return ch[a+ido*(b+l1*c)]; };
 
 1324  auto CC = [cc,ido,cdim](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1325    { 
return cc[a+ido*(b+cdim*c)]; };
 
 1326  auto CX = [cc, ido, l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1327    { 
return cc[a+ido*(b+l1*c)]; };
 
 1328  auto CX2 = [cc, idl1](
size_t a, 
size_t b) -> T&
 
 1329    { 
return cc[a+idl1*b]; };
 
 1330  auto CH2 = [ch, idl1](
size_t a, 
size_t b) -> 
const T&
 
 1331    { 
return ch[a+idl1*b]; };
 
 1333  arr<cmplx<T0>> wal(ip);
 
 1334  wal[0] = cmplx<T0>(1., 0.);
 
 1335  for (
size_t i=1; i<ip; ++i)
 
 1336    wal[i]=cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i);
 
 1338  for (
size_t k=0; k<l1; ++k)
 
 1339    for (
size_t i=0; i<ido; ++i)
 
 1340      CH(i,k,0) = CC(i,0,k);
 
 1341  for (
size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
 
 1342    for (
size_t k=0; k<l1; ++k)
 
 1343      for (
size_t i=0; i<ido; ++i)
 
 1344        PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k));
 
 1345  for (
size_t k=0; k<l1; ++k)
 
 1346    for (
size_t i=0; i<ido; ++i)
 
 1349      for (
size_t j=1; j<ipph; ++j)
 
 1353  for (
size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
 
 1356    for (
size_t ik=0; ik<idl1; ++ik)
 
 1358      CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
 
 1359      CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
 
 1360      CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
 
 1361      CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
 
 1365    size_t j=3, jc=ip-3;
 
 1366    for (; j<ipph-1; j+=2, jc-=2)
 
 1368      iwal+=l; 
if (iwal>ip) iwal-=ip;
 
 1369      cmplx<T0> xwal=wal[iwal];
 
 1370      iwal+=l; 
if (iwal>ip) iwal-=ip;
 
 1371      cmplx<T0> xwal2=wal[iwal];
 
 1372      for (
size_t ik=0; ik<idl1; ++ik)
 
 1374        CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
 
 1375        CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
 
 1376        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
 
 1377        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
 
 1380    for (; j<ipph; ++j, --jc)
 
 1382      iwal+=l; 
if (iwal>ip) iwal-=ip;
 
 1383      cmplx<T0> xwal=wal[iwal];
 
 1384      for (
size_t ik=0; ik<idl1; ++ik)
 
 1386        CX2(ik,l).r += CH2(ik,j).r*xwal.r;
 
 1387        CX2(ik,l).i += CH2(ik,j).i*xwal.r;
 
 1388        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
 
 1389        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
 
 1396    for (
size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
 
 1397      for (
size_t ik=0; ik<idl1; ++ik)
 
 1399        T t1=CX2(ik,j), t2=CX2(ik,jc);
 
 1400        PM(CX2(ik,j),CX2(ik,jc),t1,t2);
 
 1404    for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
 
 1405      for (
size_t k=0; k<l1; ++k)
 
 1407        T t1=CX(0,k,j), t2=CX(0,k,jc);
 
 1408        PM(CX(0,k,j),CX(0,k,jc),t1,t2);
 
 1409        for (
size_t i=1; i<ido; ++i)
 
 1412          PM(x1,x2,CX(i,k,j),CX(i,k,jc));
 
 1413          size_t idij=(j-1)*(ido-1)+i-1;
 
 1414          special_mul<fwd>(x1,wa[idij],CX(i,k,j));
 
 1415          idij=(jc-1)*(ido-1)+i-1;
 
 1416          special_mul<fwd>(x2,wa[idij],CX(i,k,jc));
 
 1422template<
bool fwd, 
typename T> 
void pass_all(T c[], T0 fct)
 const 
 1424  if (length==1) { c[0]*=fct; 
return; }
 
 1427  T *p1=c, *p2=ch.data();
 
 1429  for(
size_t k1=0; k1<fact.size(); k1++)
 
 1431    size_t ip=fact[k1].fct;
 
 1433    size_t ido = length/l2;
 
 1435      pass4<fwd> (ido, l1, p1, p2, fact[k1].tw);
 
 1437      pass8<fwd>(ido, l1, p1, p2, fact[k1].tw);
 
 1439      pass2<fwd>(ido, l1, p1, p2, fact[k1].tw);
 
 1441      pass3<fwd> (ido, l1, p1, p2, fact[k1].tw);
 
 1443      pass5<fwd> (ido, l1, p1, p2, fact[k1].tw);
 
 1445      pass7<fwd> (ido, l1, p1, p2, fact[k1].tw);
 
 1447      pass11<fwd> (ido, l1, p1, p2, fact[k1].tw);
 
 1450      passg<fwd>(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws);
 
 1459      for (
size_t i=0; i<length; ++i)
 
 1462      std::copy_n (p1, length, c);
 
 1466      for (
size_t i=0; i<length; ++i)
 
 1479        { add_factor(8); len>>=3; }
 
 1481        { add_factor(4); len>>=2; }
 
 1487        std::swap(fact[0].fct, fact.back().fct);
 
 1489      for (
size_t divisor=3; divisor*divisor<=len; divisor+=2)
 
 1490        while ((len%divisor)==0)
 
 1492          add_factor(divisor);
 
 1495      if (len>1) add_factor(len);
 
 1498    size_t twsize()
 const 
 1500      size_t twsize=0, l1=1;
 
 1501      for (
size_t k=0; k<fact.size(); ++k)
 
 1503        size_t ip=fact[k].fct, ido= length/(l1*ip);
 
 1504        twsize+=(ip-1)*(ido-1);
 
 1514      sincos_2pibyn<T0> twiddle(length);
 
 1517      for (
size_t k=0; k<fact.size(); ++k)
 
 1519        size_t ip=fact[k].fct, ido=length/(l1*ip);
 
 1520        fact[k].tw=mem.data()+memofs;
 
 1521        memofs+=(ip-1)*(ido-1);
 
 1522        for (
size_t j=1; j<ip; ++j)
 
 1523          for (
size_t i=1; i<ido; ++i)
 
 1524            fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
 
 1527          fact[k].tws=mem.data()+memofs;
 
 1529          for (
size_t j=0; j<ip; ++j)
 
 1530            fact[k].tws[j] = twiddle[j*l1*ido];
 
 1540      if (length==0) 
throw std::runtime_error(
"zero-length FFT requested");
 
 1541      if (length==1) 
return;
 
 1543      mem.resize(twsize());
 
 
 
 1563    std::vector<fctdata> fact;
 
 1565    void add_factor(
size_t factor)
 
 1566      { fact.push_back({
factor, 
nullptr, 
nullptr}); }
 
 1569template<
typename T1, 
typename T2, 
typename T3> 
inline void MULPM
 
 1571  {  
a=c*e+d*
f; b=c*
f-d*e; }
 
 1573template<
typename T> 
void radf2 (
size_t ido, 
size_t l1,
 
 1577  auto WA = [
wa,
ido](
size_t x, 
size_t i) { 
return wa[i+
x*(
ido-1)]; };
 
 1578  auto CC = [
cc,
ido,
l1](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1580  auto CH = [
ch,
ido](
size_t a, 
size_t b, 
size_t c) -> 
T&
 
 1581    { 
return ch[
a+
ido*(b+2*c)]; };
 
 1583  for (
size_t k=0; 
k<
l1; 
k++)
 
 1584    PM (
CH(0,0,
k),
CH(
ido-1,1,
k),
CC(0,
k,0),
CC(0,
k,1));
 
 1586    for (
size_t k=0; 
k<
l1; 
k++)
 
 1592  for (
size_t k=0; 
k<
l1; 
k++)
 
 1593    for (
size_t i=2; i<
ido; i+=2)
 
 1597      MULPM (
tr2,
ti2,
WA(0,i-2),
WA(0,i-1),
CC(i-1,
k,1),
CC(i,
k,1));
 
 1604#define POCKETFFT_REARRANGE(rx, ix, ry, iy) \ 
 1606  auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \ 
 1607  rx=t1; ix=t3; ry=t4; iy=t2; \ 
 
 1610template<
typename T> 
void radf3(
size_t ido, 
size_t l1,
 
 1614  constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
 
 1616  auto WA = [wa,ido](
size_t x, 
size_t i) { 
return wa[i+x*(ido-1)]; };
 
 1617  auto CC = [cc,ido,l1](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1618    { 
return cc[a+ido*(b+l1*c)]; };
 
 1619  auto CH = [ch,ido](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1620    { 
return ch[a+ido*(b+3*c)]; };
 
 1622  for (
size_t k=0; k<l1; k++)
 
 1624    T cr2=CC(0,k,1)+CC(0,k,2);
 
 1625    CH(0,0,k) = CC(0,k,0)+cr2;
 
 1626    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
 
 1627    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
 
 1630  for (
size_t k=0; k<l1; k++)
 
 1631    for (
size_t i=2; i<ido; i+=2)
 
 1634      T di2, di3, dr2, dr3;
 
 1635      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); 
 
 1636      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); 
 
 1638      CH(i-1,0,k) = CC(i-1,k,0)+dr2; 
 
 1639      CH(i  ,0,k) = CC(i  ,k,0)+di2;
 
 1640      T tr2 = CC(i-1,k,0)+taur*dr2; 
 
 1641      T ti2 = CC(i  ,k,0)+taur*di2;
 
 1644      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3); 
 
 1645      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2); 
 
 1649template<
typename T> 
void radf4(
size_t ido, 
size_t l1,
 
 1653  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
 
 1655  auto WA = [wa,ido](
size_t x, 
size_t i) { 
return wa[i+x*(ido-1)]; };
 
 1656  auto CC = [cc,ido,l1](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1657    { 
return cc[a+ido*(b+l1*c)]; };
 
 1658  auto CH = [ch,ido](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1659    { 
return ch[a+ido*(b+4*c)]; };
 
 1661  for (
size_t k=0; k<l1; k++)
 
 1664    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1));
 
 1665    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2));
 
 1666    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1);
 
 1669    for (
size_t k=0; k<l1; k++)
 
 1671      T ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
 
 1672      T tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
 
 1673      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1);
 
 1674      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2));
 
 1677  for (
size_t k=0; k<l1; k++)
 
 1678    for (
size_t i=2; i<ido; i+=2)
 
 1681      T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
 
 1682      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
 
 1683      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
 
 1684      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
 
 1685      PM(tr1,tr4,cr4,cr2);
 
 1686      PM(ti1,ti4,ci2,ci4);
 
 1687      PM(tr2,tr3,CC(i-1,k,0),cr3);
 
 1688      PM(ti2,ti3,CC(i  ,k,0),ci3);
 
 1689      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1);
 
 1690      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2);
 
 1691      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4);
 
 1692      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3);
 
 1696template<
typename T> 
void radf5(
size_t ido, 
size_t l1,
 
 1700  constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
 
 1701               ti11= T0(0.9510565162951535721164393333793821L),
 
 1702               tr12= T0(-0.8090169943749474241022934171828191L),
 
 1703               ti12= T0(0.5877852522924731291687059546390728L);
 
 1705  auto WA = [wa,ido](
size_t x, 
size_t i) { 
return wa[i+x*(ido-1)]; };
 
 1706  auto CC = [cc,ido,l1](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1707    { 
return cc[a+ido*(b+l1*c)]; };
 
 1708  auto CH = [ch,ido](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1709    { 
return ch[a+ido*(b+5*c)]; };
 
 1711  for (
size_t k=0; k<l1; k++)
 
 1713    T cr2, cr3, ci4, ci5;
 
 1714    PM (cr2,ci5,CC(0,k,4),CC(0,k,1));
 
 1715    PM (cr3,ci4,CC(0,k,3),CC(0,k,2));
 
 1716    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
 
 1717    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
 
 1718    CH(0,2,k)=ti11*ci5+ti12*ci4;
 
 1719    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
 
 1720    CH(0,4,k)=ti12*ci5-ti11*ci4;
 
 1723  for (
size_t k=0; k<l1;++k)
 
 1724    for (
size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
 
 1726      T di2, di3, di4, di5, dr2, dr3, dr4, dr5;
 
 1727      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
 
 1728      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
 
 1729      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
 
 1730      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4));
 
 1733      CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3;
 
 1734      CH(i  ,0,k)=CC(i  ,k,0)+di2+di3;
 
 1735      T tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3;
 
 1736      T ti2=CC(i  ,k,0)+tr11*di2+tr12*di3;
 
 1737      T tr3=CC(i-1,k,0)+tr12*dr2+tr11*dr3;
 
 1738      T ti3=CC(i  ,k,0)+tr12*di2+tr11*di3;
 
 1739      T tr5 = ti11*dr5 + ti12*dr4;
 
 1740      T ti5 = ti11*di5 + ti12*di4;
 
 1741      T tr4 = ti12*dr5 - ti11*dr4;
 
 1742      T ti4 = ti12*di5 - ti11*di4;
 
 1743      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5);
 
 1744      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2);
 
 1745      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4);
 
 1746      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3);
 
 1750#undef POCKETFFT_REARRANGE 
 1752template<
typename T> 
void radfg(
size_t ido, 
size_t ip, 
size_t l1,
 
 1756  const size_t cdim=ip;
 
 1757  size_t ipph=(ip+1)/2;
 
 1758  size_t idl1 = ido*l1;
 
 1760  auto CC = [cc,ido,cdim](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1761    { 
return cc[a+ido*(b+cdim*c)]; };
 
 1762  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1763    { 
return ch[a+ido*(b+l1*c)]; };
 
 1764  auto C1 = [cc,ido,l1] (
size_t a, 
size_t b, 
size_t c) -> T&
 
 1765    { 
return cc[a+ido*(b+l1*c)]; };
 
 1766  auto C2 = [cc,idl1] (
size_t a, 
size_t b) -> T&
 
 1767    { 
return cc[a+idl1*b]; };
 
 1768  auto CH2 = [ch,idl1] (
size_t a, 
size_t b) -> T&
 
 1769    { 
return ch[a+idl1*b]; };
 
 1773    for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              
 
 1775      size_t is=(j-1)*(ido-1),
 
 1777      for (
size_t k=0; k<l1; ++k)                            
 
 1781        for (
size_t i=1; i<=ido-2; i+=2)                      
 
 1783          T t1=C1(i,k,j ), t2=C1(i+1,k,j ),
 
 1784            t3=C1(i,k,jc), t4=C1(i+1,k,jc);
 
 1785          T x1=wa[idij]*t1 + wa[idij+1]*t2,
 
 1786            x2=wa[idij]*t2 - wa[idij+1]*t1,
 
 1787            x3=wa[idij2]*t3 + wa[idij2+1]*t4,
 
 1788            x4=wa[idij2]*t4 - wa[idij2+1]*t3;
 
 1789          PM(C1(i,k,j),C1(i+1,k,jc),x3,x1);
 
 1790          PM(C1(i+1,k,j),C1(i,k,jc),x2,x4);
 
 1798  for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                
 
 1799    for (
size_t k=0; k<l1; ++k)                              
 
 1805  for (
size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 
 
 1807    for (
size_t ik=0; ik<idl1; ++ik)                         
 
 1809      CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
 
 1810      CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
 
 1813    size_t j=3, jc=ip-3;
 
 1814    for (; j<ipph-3; j+=4,jc-=4)              
 
 1816      iang+=l; 
if (iang>=ip) iang-=ip;
 
 1817      T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
 
 1818      iang+=l; 
if (iang>=ip) iang-=ip;
 
 1819      T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
 
 1820      iang+=l; 
if (iang>=ip) iang-=ip;
 
 1821      T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
 
 1822      iang+=l; 
if (iang>=ip) iang-=ip;
 
 1823      T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
 
 1824      for (
size_t ik=0; ik<idl1; ++ik)                       
 
 1826        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
 
 1827                     +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
 
 1828        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
 
 1829                     +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
 
 1832    for (; j<ipph-1; j+=2,jc-=2)              
 
 1834      iang+=l; 
if (iang>=ip) iang-=ip;
 
 1835      T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
 
 1836      iang+=l; 
if (iang>=ip) iang-=ip;
 
 1837      T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
 
 1838      for (
size_t ik=0; ik<idl1; ++ik)                       
 
 1840        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
 
 1841        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
 
 1844    for (; j<ipph; ++j,--jc)              
 
 1846      iang+=l; 
if (iang>=ip) iang-=ip;
 
 1847      T0 ar=csarr[2*iang], ai=csarr[2*iang+1];
 
 1848      for (
size_t ik=0; ik<idl1; ++ik)                       
 
 1850        CH2(ik,l ) += ar*C2(ik,j );
 
 1851        CH2(ik,lc) += ai*C2(ik,jc);
 
 1855  for (
size_t ik=0; ik<idl1; ++ik)                         
 
 1856    CH2(ik,0) = C2(ik,0);
 
 1857  for (
size_t j=1; j<ipph; ++j)                              
 
 1858    for (
size_t ik=0; ik<idl1; ++ik)                         
 
 1859      CH2(ik,0) += C2(ik,j);
 
 1864  for (
size_t k=0; k<l1; ++k)                                
 
 1865    for (
size_t i=0; i<ido; ++i)                             
 
 1866      CC(i,0,k) = CH(i,k,0);
 
 1868  for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                
 
 1871    for (
size_t k=0; k<l1; ++k)                              
 
 1873      CC(ido-1,j2,k) = CH(0,k,j);
 
 1874      CC(0,j2+1,k) = CH(0,k,jc);
 
 1880  for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                
 
 1883    for(
size_t k=0; k<l1; ++k)                               
 
 1884      for(
size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      
 
 1886        CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
 
 1887        CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
 
 1888        CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
 
 1889        CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
 
 1894template<
typename T> 
void radb2(
size_t ido, 
size_t l1,
 
 1898  auto WA = [wa,ido](
size_t x, 
size_t i) { 
return wa[i+x*(ido-1)]; };
 
 1899  auto CC = [cc,ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1900    { 
return cc[a+ido*(b+2*c)]; };
 
 1901  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1902    { 
return ch[a+ido*(b+l1*c)]; };
 
 1904  for (
size_t k=0; k<l1; k++)
 
 1905    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k));
 
 1907    for (
size_t k=0; k<l1; k++)
 
 1909      CH(ido-1,k,0) = 2*CC(ido-1,0,k);
 
 1910      CH(ido-1,k,1) =-2*CC(0    ,1,k);
 
 1913  for (
size_t k=0; k<l1;++k)
 
 1914    for (
size_t i=2; i<ido; i+=2)
 
 1918      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k));
 
 1919      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k));
 
 1920      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2);
 
 1924template<
typename T> 
void radb3(
size_t ido, 
size_t l1,
 
 1928  constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
 
 1930  auto WA = [wa,ido](
size_t x, 
size_t i) { 
return wa[i+x*(ido-1)]; };
 
 1931  auto CC = [cc,ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1932    { 
return cc[a+ido*(b+3*c)]; };
 
 1933  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1934    { 
return ch[a+ido*(b+l1*c)]; };
 
 1936  for (
size_t k=0; k<l1; k++)
 
 1938    T tr2=2*CC(ido-1,1,k);
 
 1939    T cr2=CC(0,0,k)+taur*tr2;
 
 1940    CH(0,k,0)=CC(0,0,k)+tr2;
 
 1941    T ci3=2*taui*CC(0,2,k);
 
 1942    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
 
 1945  for (
size_t k=0; k<l1; k++)
 
 1946    for (
size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
 
 1948      T tr2=CC(i-1,2,k)+CC(ic-1,1,k); 
 
 1949      T ti2=CC(i  ,2,k)-CC(ic  ,1,k);
 
 1950      T cr2=CC(i-1,0,k)+taur*tr2;     
 
 1951      T ci2=CC(i  ,0,k)+taur*ti2;
 
 1952      CH(i-1,k,0)=CC(i-1,0,k)+tr2;         
 
 1953      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
 
 1954      T cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));
 
 1955      T ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
 
 1956      T di2, di3, dr2, dr3;
 
 1957      PM(dr3,dr2,cr2,ci3); 
 
 1958      PM(di2,di3,ci2,cr3); 
 
 1959      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); 
 
 1960      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
 
 1964template<
typename T> 
void radb4(
size_t ido, 
size_t l1,
 
 1968  constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
 
 1970  auto WA = [wa,ido](
size_t x, 
size_t i) { 
return wa[i+x*(ido-1)]; };
 
 1971  auto CC = [cc,ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 1972    { 
return cc[a+ido*(b+4*c)]; };
 
 1973  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 1974    { 
return ch[a+ido*(b+l1*c)]; };
 
 1976  for (
size_t k=0; k<l1; k++)
 
 1979    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k));
 
 1980    T tr3=2*CC(ido-1,1,k);
 
 1982    PM (CH(0,k,0),CH(0,k,2),tr2,tr3);
 
 1983    PM (CH(0,k,3),CH(0,k,1),tr1,tr4);
 
 1986    for (
size_t k=0; k<l1; k++)
 
 1989      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k));
 
 1990      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k));
 
 1991      CH(ido-1,k,0)=tr2+tr2;
 
 1992      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
 
 1993      CH(ido-1,k,2)=ti2+ti2;
 
 1994      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
 
 1997  for (
size_t k=0; k<l1;++k)
 
 1998    for (
size_t i=2; i<ido; i+=2)
 
 2000      T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
 
 2002      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k));
 
 2003      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k));
 
 2004      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k));
 
 2005      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k));
 
 2006      PM (CH(i-1,k,0),cr3,tr2,tr3);
 
 2007      PM (CH(i  ,k,0),ci3,ti2,ti3);
 
 2008      PM (cr4,cr2,tr1,tr4);
 
 2009      PM (ci2,ci4,ti1,ti4);
 
 2010      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2);
 
 2011      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3);
 
 2012      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4);
 
 2016template<
typename T> 
void radb5(
size_t ido, 
size_t l1,
 
 2020  constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
 
 2021               ti11= T0(0.9510565162951535721164393333793821L),
 
 2022               tr12= T0(-0.8090169943749474241022934171828191L),
 
 2023               ti12= T0(0.5877852522924731291687059546390728L);
 
 2025  auto WA = [wa,ido](
size_t x, 
size_t i) { 
return wa[i+x*(ido-1)]; };
 
 2026  auto CC = [cc,ido](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 2027    { 
return cc[a+ido*(b+5*c)]; };
 
 2028  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 2029    { 
return ch[a+ido*(b+l1*c)]; };
 
 2031  for (
size_t k=0; k<l1; k++)
 
 2033    T ti5=CC(0,2,k)+CC(0,2,k);
 
 2034    T ti4=CC(0,4,k)+CC(0,4,k);
 
 2035    T tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
 
 2036    T tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
 
 2037    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
 
 2038    T cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
 
 2039    T cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
 
 2041    MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
 
 2042    PM(CH(0,k,4),CH(0,k,1),cr2,ci5);
 
 2043    PM(CH(0,k,3),CH(0,k,2),cr3,ci4);
 
 2046  for (
size_t k=0; k<l1;++k)
 
 2047    for (
size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
 
 2049      T tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
 
 2050      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k));
 
 2051      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k));
 
 2052      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k));
 
 2053      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k));
 
 2054      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
 
 2055      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
 
 2056      T cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
 
 2057      T ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
 
 2058      T cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
 
 2059      T ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
 
 2060      T ci4, ci5, cr5, cr4;
 
 2061      MULPM(cr5,cr4,tr5,tr4,ti11,ti12);
 
 2062      MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
 
 2063      T dr2, dr3, dr4, dr5, di2, di3, di4, di5;
 
 2064      PM(dr4,dr3,cr3,ci4);
 
 2065      PM(di3,di4,ci3,cr4);
 
 2066      PM(dr5,dr2,cr2,ci5);
 
 2067      PM(di2,di5,ci2,cr5);
 
 2068      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2);
 
 2069      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
 
 2070      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4);
 
 2071      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5);
 
 2075template<
typename T> 
void radbg(
size_t ido, 
size_t ip, 
size_t l1,
 
 2079  const size_t cdim=ip;
 
 2080  size_t ipph=(ip+1)/ 2;
 
 2081  size_t idl1 = ido*l1;
 
 2083  auto CC = [cc,ido,cdim](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 2084    { 
return cc[a+ido*(b+cdim*c)]; };
 
 2085  auto CH = [ch,ido,l1](
size_t a, 
size_t b, 
size_t c) -> T&
 
 2086    { 
return ch[a+ido*(b+l1*c)]; };
 
 2087  auto C1 = [cc,ido,l1](
size_t a, 
size_t b, 
size_t c) -> 
const T&
 
 2088    { 
return cc[a+ido*(b+l1*c)]; };
 
 2089  auto C2 = [cc,idl1](
size_t a, 
size_t b) -> T&
 
 2090    { 
return cc[a+idl1*b]; };
 
 2091  auto CH2 = [ch,idl1](
size_t a, 
size_t b) -> T&
 
 2092    { 
return ch[a+idl1*b]; };
 
 2094  for (
size_t k=0; k<l1; ++k)        
 
 2095    for (
size_t i=0; i<ido; ++i)     
 
 2096      CH(i,k,0) = CC(i,0,k);
 
 2097  for (
size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   
 
 2100    for (
size_t k=0; k<l1; ++k)
 
 2102      CH(0,k,j ) = 2*CC(ido-1,j2,k);
 
 2103      CH(0,k,jc) = 2*CC(0,j2+1,k);
 
 2109    for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   
 
 2112      for (
size_t k=0; k<l1; ++k)
 
 2113        for (
size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      
 
 2115          CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
 
 2116          CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
 
 2117          CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
 
 2118          CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
 
 2122  for (
size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
 
 2124    for (
size_t ik=0; ik<idl1; ++ik)
 
 2126      C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
 
 2127      C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
 
 2131    for(; j<ipph-3; j+=4,jc-=4)
 
 2133      iang+=l; 
if(iang>ip) iang-=ip;
 
 2134      T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
 
 2135      iang+=l; 
if(iang>ip) iang-=ip;
 
 2136      T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
 
 2137      iang+=l; 
if(iang>ip) iang-=ip;
 
 2138      T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
 
 2139      iang+=l; 
if(iang>ip) iang-=ip;
 
 2140      T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
 
 2141      for (
size_t ik=0; ik<idl1; ++ik)
 
 2143        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
 
 2144                    +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
 
 2145        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
 
 2146                    +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
 
 2149    for(; j<ipph-1; j+=2,jc-=2)
 
 2151      iang+=l; 
if(iang>ip) iang-=ip;
 
 2152      T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
 
 2153      iang+=l; 
if(iang>ip) iang-=ip;
 
 2154      T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
 
 2155      for (
size_t ik=0; ik<idl1; ++ik)
 
 2157        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
 
 2158        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
 
 2161    for(; j<ipph; ++j,--jc)
 
 2163      iang+=l; 
if(iang>ip) iang-=ip;
 
 2164      T0 war=csarr[2*iang], wai=csarr[2*iang+1];
 
 2165      for (
size_t ik=0; ik<idl1; ++ik)
 
 2167        C2(ik,l ) += war*CH2(ik,j );
 
 2168        C2(ik,lc) += wai*CH2(ik,jc);
 
 2172  for (
size_t j=1; j<ipph; ++j)
 
 2173    for (
size_t ik=0; ik<idl1; ++ik)
 
 2174      CH2(ik,0) += CH2(ik,j);
 
 2175  for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   
 
 2176    for (
size_t k=0; k<l1; ++k)
 
 2177      PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc));
 
 2181  for (
size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  
 
 2182    for (
size_t k=0; k<l1; ++k)
 
 2183      for (
size_t i=1; i<=ido-2; i+=2)
 
 2185        CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
 
 2186        CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
 
 2187        CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
 
 2188        CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
 
 2193  for (
size_t j=1; j<ip; ++j)
 
 2195    size_t is = (j-1)*(ido-1);
 
 2196    for (
size_t k=0; k<l1; ++k)
 
 2199      for (
size_t i=1; i<=ido-2; i+=2)
 
 2201        T t1=CH(i,k,j), t2=CH(i+1,k,j);
 
 2202        CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
 
 2203        CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
 
 2210    template<
typename T> 
void copy_and_norm(T *c, T *p1, T0 fct)
 const 
 2215          for (
size_t i=0; i<length; ++i)
 
 2218          std::copy_n (p1, length, c);
 
 2222          for (
size_t i=0; i<length; ++i)
 
 2229      if (length==1) { c[0]*=fct; 
return; }
 
 2230      size_t nf=fact.size();
 
 2238          size_t ip=fact[
k].fct;
 
 2239          size_t ido=length / 
l1;
 
 2254        for(
size_t k=0, 
l1=1; 
k<
nf; 
k++)
 
 2256          size_t ip = fact[
k].fct,
 
 2272      copy_and_norm(c,
p1,fct);
 
 
 2280        { add_factor(4); len>>=2; }
 
 2286        std::swap(fact[0].fct, fact.back().fct);
 
 2288      for (
size_t divisor=3; divisor*divisor<=len; divisor+=2)
 
 2289        while ((len%divisor)==0)
 
 2291          add_factor(divisor);
 
 2294      if (len>1) add_factor(len);
 
 2297    size_t twsize()
 const 
 2299      size_t twsz=0, l1=1;
 
 2300      for (
size_t k=0; k<fact.size(); ++k)
 
 2302        size_t ip=fact[k].fct, ido=length/(l1*ip);
 
 2303        twsz+=(ip-1)*(ido-1);
 
 2304        if (ip>5) twsz+=2*ip;
 
 2312      sincos_2pibyn<T0> twid(length);
 
 2315      for (
size_t k=0; k<fact.size(); ++k)
 
 2317        size_t ip=fact[k].fct, ido=length/(l1*ip);
 
 2318        if (k<fact.size()-1) 
 
 2320          fact[k].tw=ptr; ptr+=(ip-1)*(ido-1);
 
 2321          for (
size_t j=1; j<ip; ++j)
 
 2322            for (
size_t i=1; i<=(ido-1)/2; ++i)
 
 2324              fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r;
 
 2325              fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i;
 
 2330          fact[k].tws=ptr; ptr+=2*ip;
 
 2331          fact[k].tws[0] = 1.;
 
 2332          fact[k].tws[1] = 0.;
 
 2333          for (
size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2)
 
 2335            fact[k].tws[i  ] = twid[i/2*(length/ip)].r;
 
 2336            fact[k].tws[i+1] = twid[i/2*(length/ip)].i;
 
 2337            fact[k].tws[ic]   = twid[i/2*(length/ip)].r;
 
 2338            fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i;
 
 2349      if (length==0) 
throw std::runtime_error(
"zero-length FFT requested");
 
 2350      if (length==1) 
return;
 
 2352      mem.resize(twsize());