TriangularSolverMatrix.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
11 #define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
12 
13 namespace Eigen {
14 
15 namespace internal {
16 
17 // if the rhs is row major, let's transpose the product
18 template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
19 struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
20 {
21  static EIGEN_DONT_INLINE void run(
22  Index size, Index cols,
23  const Scalar* tri, Index triStride,
24  Scalar* _other, Index otherStride,
25  level3_blocking<Scalar,Scalar>& blocking)
26  {
27  triangular_solve_matrix<
28  Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
29  (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
30  NumTraits<Scalar>::IsComplex && Conjugate,
31  TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor>
32  ::run(size, cols, tri, triStride, _other, otherStride, blocking);
33  }
34 };
35 
36 /* Optimized triangular solver with multiple right hand side and the triangular matrix on the left
37  */
38 template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
39 struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>
40 {
41  static EIGEN_DONT_INLINE void run(
42  Index size, Index otherSize,
43  const Scalar* _tri, Index triStride,
44  Scalar* _other, Index otherStride,
45  level3_blocking<Scalar,Scalar>& blocking)
46  {
47  Index cols = otherSize;
48  const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
49  blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
50 
51  typedef gebp_traits<Scalar,Scalar> Traits;
52  enum {
53  SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
54  IsLower = (Mode&Lower) == Lower
55  };
56 
57  Index kc = blocking.kc(); // cache block size along the K direction
58  Index mc = (std::min)(size,blocking.mc()); // cache block size along the M direction
59 
60  std::size_t sizeA = kc*mc;
61  std::size_t sizeB = kc*cols;
62  std::size_t sizeW = kc*Traits::WorkSpaceFactor;
63 
64  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
65  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
66  ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
67 
68  conj_if<Conjugate> conj;
69  gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
70  gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
71  gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
72 
73  // the goal here is to subdivise the Rhs panels such that we keep some cache
74  // coherence when accessing the rhs elements
75  std::ptrdiff_t l1, l2;
76  manage_caching_sizes(GetAction, &l1, &l2);
77  Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
78  subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
79 
80  for(Index k2=IsLower ? 0 : size;
81  IsLower ? k2<size : k2>0;
82  IsLower ? k2+=kc : k2-=kc)
83  {
84  const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc);
85 
86  // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
87  // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
88  // A11 (the triangular part) and A21 the remaining rectangular part.
89  // Then the high level algorithm is:
90  // - B = R1 => general block copy (done during the next step)
91  // - R1 = A11^-1 B => tricky part
92  // - update B from the new R1 => actually this has to be performed continuously during the above step
93  // - R2 -= A21 * B => GEPP
94 
95  // The tricky part: compute R1 = A11^-1 B while updating B from R1
96  // The idea is to split A11 into multiple small vertical panels.
97  // Each panel can be split into a small triangular part T1k which is processed without optimization,
98  // and the remaining small part T2k which is processed using gebp with appropriate block strides
99  for(Index j2=0; j2<cols; j2+=subcols)
100  {
101  Index actual_cols = (std::min)(cols-j2,subcols);
102  // for each small vertical panels [T1k^T, T2k^T]^T of lhs
103  for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
104  {
105  Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
106  // tr solve
107  for (Index k=0; k<actualPanelWidth; ++k)
108  {
109  // TODO write a small kernel handling this (can be shared with trsv)
110  Index i = IsLower ? k2+k1+k : k2-k1-k-1;
111  Index s = IsLower ? k2+k1 : i+1;
112  Index rs = actualPanelWidth - k - 1; // remaining size
113 
114  Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
115  for (Index j=j2; j<j2+actual_cols; ++j)
116  {
117  if (TriStorageOrder==RowMajor)
118  {
119  Scalar b(0);
120  const Scalar* l = &tri(i,s);
121  Scalar* r = &other(s,j);
122  for (Index i3=0; i3<k; ++i3)
123  b += conj(l[i3]) * r[i3];
124 
125  other(i,j) = (other(i,j) - b)*a;
126  }
127  else
128  {
129  Index s = IsLower ? i+1 : i-rs;
130  Scalar b = (other(i,j) *= a);
131  Scalar* r = &other(s,j);
132  const Scalar* l = &tri(s,i);
133  for (Index i3=0;i3<rs;++i3)
134  r[i3] -= b * conj(l[i3]);
135  }
136  }
137  }
138 
139  Index lengthTarget = actual_kc-k1-actualPanelWidth;
140  Index startBlock = IsLower ? k2+k1 : k2-k1-actualPanelWidth;
141  Index blockBOffset = IsLower ? k1 : lengthTarget;
142 
143  // update the respective rows of B from other
144  pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
145 
146  // GEBP
147  if (lengthTarget>0)
148  {
149  Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
150 
151  pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
152 
153  gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
154  actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
155  }
156  }
157  }
158 
159  // R2 -= A21 * B => GEPP
160  {
161  Index start = IsLower ? k2+kc : 0;
162  Index end = IsLower ? size : k2-kc;
163  for(Index i2=start; i2<end; i2+=mc)
164  {
165  const Index actual_mc = (std::min)(mc,end-i2);
166  if (actual_mc>0)
167  {
168  pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
169 
170  gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW);
171  }
172  }
173  }
174  }
175  }
176 };
177 
178 /* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right
179  */
180 template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
181 struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
182 {
183  static EIGEN_DONT_INLINE void run(
184  Index size, Index otherSize,
185  const Scalar* _tri, Index triStride,
186  Scalar* _other, Index otherStride,
187  level3_blocking<Scalar,Scalar>& blocking)
188  {
189  Index rows = otherSize;
190  const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
191  blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
192 
193  typedef gebp_traits<Scalar,Scalar> Traits;
194  enum {
195  RhsStorageOrder = TriStorageOrder,
196  SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
197  IsLower = (Mode&Lower) == Lower
198  };
199 
200  Index kc = blocking.kc(); // cache block size along the K direction
201  Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
202 
203  std::size_t sizeA = kc*mc;
204  std::size_t sizeB = kc*size;
205  std::size_t sizeW = kc*Traits::WorkSpaceFactor;
206 
207  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
208  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
209  ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
210 
211  conj_if<Conjugate> conj;
212  gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
213  gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
214  gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
215  gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
216 
217  for(Index k2=IsLower ? size : 0;
218  IsLower ? k2>0 : k2<size;
219  IsLower ? k2-=kc : k2+=kc)
220  {
221  const Index actual_kc = (std::min)(IsLower ? k2 : size-k2, kc);
222  Index actual_k2 = IsLower ? k2-actual_kc : k2 ;
223 
224  Index startPanel = IsLower ? 0 : k2+actual_kc;
225  Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
226  Scalar* geb = blockB+actual_kc*actual_kc;
227 
228  if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
229 
230  // triangular packing (we only pack the panels off the diagonal,
231  // neglecting the blocks overlapping the diagonal
232  {
233  for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
234  {
235  Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
236  Index actual_j2 = actual_k2 + j2;
237  Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
238  Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
239 
240  if (panelLength>0)
241  pack_rhs_panel(blockB+j2*actual_kc,
242  &rhs(actual_k2+panelOffset, actual_j2), triStride,
243  panelLength, actualPanelWidth,
244  actual_kc, panelOffset);
245  }
246  }
247 
248  for(Index i2=0; i2<rows; i2+=mc)
249  {
250  const Index actual_mc = (std::min)(mc,rows-i2);
251 
252  // triangular solver kernel
253  {
254  // for each small block of the diagonal (=> vertical panels of rhs)
255  for (Index j2 = IsLower
256  ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth)
257  : Index(SmallPanelWidth)))
258  : 0;
259  IsLower ? j2>=0 : j2<actual_kc;
260  IsLower ? j2-=SmallPanelWidth : j2+=SmallPanelWidth)
261  {
262  Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
263  Index absolute_j2 = actual_k2 + j2;
264  Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
265  Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2;
266 
267  // GEBP
268  if(panelLength>0)
269  {
270  gebp_kernel(&lhs(i2,absolute_j2), otherStride,
271  blockA, blockB+j2*actual_kc,
272  actual_mc, panelLength, actualPanelWidth,
273  Scalar(-1),
274  actual_kc, actual_kc, // strides
275  panelOffset, panelOffset, // offsets
276  blockW); // workspace
277  }
278 
279  // unblocked triangular solve
280  for (Index k=0; k<actualPanelWidth; ++k)
281  {
282  Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
283 
284  Scalar* r = &lhs(i2,j);
285  for (Index k3=0; k3<k; ++k3)
286  {
287  Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
288  Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
289  for (Index i=0; i<actual_mc; ++i)
290  r[i] -= a[i] * b;
291  }
292  Scalar b = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(rhs(j,j));
293  for (Index i=0; i<actual_mc; ++i)
294  r[i] *= b;
295  }
296 
297  // pack the just computed part of lhs to A
298  pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
299  actualPanelWidth, actual_mc,
300  actual_kc, j2);
301  }
302  }
303 
304  if (rs>0)
305  gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
306  actual_mc, actual_kc, rs, Scalar(-1),
307  -1, -1, 0, 0, blockW);
308  }
309  }
310  }
311 };
312 
313 } // end namespace internal
314 
315 } // end namespace Eigen
316 
317 #endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H