1.6 rocSOLVER API
1.6.3 LAPACK Functions
1.6.3.2 General Matrix Factorizations
rocblas_status rocsolver_zgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, const rocblas_int lda, rocblas_int *ipiv, rocblas_int *info)
rocblas_status rocsolver_cgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, const rocblas_int lda, rocblas_int *ipiv, rocblas_int *info)
rocblas_status rocsolver_dgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, const rocblas_int lda, rocblas_int *ipiv, rocblas_int *info)
rocblas_status rocsolver_sgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, const rocblas_int lda, rocblas_int *ipiv, rocblas_int *info)
GETF2 computes the LU factorization of a general m-by-n matrix A using partial pivoting with row interchanges.
(This is the unblocked Level-2-BLAS version of the algorithm. An optimized internal implementation without rocBLAS calls could be executed with small and mid-size matrices if optimizations are enabled (default option).
For more details, see the section “Tuning rocSOLVER Performance” in the User’s Guide.)
The factorization has the form
A = P * L * U
where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).
Parameters
• [in] handle: rocblas_handle.
• [in] m: rocblas_int. m >= 0. The number of rows of the matrix A.
• [in] n: rocblas_int. n >= 0. The number of columns of the matrix A.
• [inout] A: pointer to type. Array on the GPU of dimension lda*n. On entry, the m-by-n matrix A to be factored. On exit, the factors L and U from the factorization. The unit diagonal elements of L are not stored.
• [in] lda: rocblas_int. lda >= m. Specifies the leading dimension of A.
• [out] ipiv: pointer to rocblas_int. Array on the GPU of dimension min(m,n). The vector of pivot indices. Elements of ipiv are 1-based indices. For 1 <= i <= min(m,n), the row i of the matrix was interchanged with row ipiv[i]. Matrix P of the factorization can be derived from ipiv.
• [out] info: pointer to a rocblas_int on the GPU. If info = 0, successful exit. If info = i > 0, U is singular. U(i,i) is the first zero pivot.
rocsolver_<type>getf2_batched()
rocblas_status rocsolver_zgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *const A[], const rocblas_int lda, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_cgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *const A[], const rocblas_int lda, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_dgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], const rocblas_int lda, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_sgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], const rocblas_int lda, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
GETF2_BATCHED computes the LU factorization of a batch of general m-by-n matrices using partial pivoting with row interchanges.
(This is the unblocked Level-2-BLAS version of the algorithm. An optimized internal implementation without rocBLAS calls could be executed with small and mid-size matrices if optimizations are enabled (default option).
For more details, see the section “Tuning rocSOLVER Performance” in the User’s Guide.)
The factorization of matrix A_i in the batch has the form
A_i = P_i * L_i * U_i
where P_i is a permutation matrix, L_i is lower triangular with unit diagonal elements (lower trapezoidal if m >
n), and U_i is upper triangular (upper trapezoidal if m < n).
Parameters
• [in] handle: rocblas_handle.
• [in] m: rocblas_int. m >= 0. The number of rows of all matrices A_i in the batch.
• [in] n: rocblas_int. n >= 0. The number of columns of all matrices A_i in the batch.
• [inout] A: array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n. On entry, the m-by-n matrices A_i to be factored. On exit, the factors L_i and U_i from the factorizations. The unit diagonal elements of L_i are not stored.
• [in] lda: rocblas_int. lda >= m. Specifies the leading dimension of matrices A_i.
• [out] ipiv: pointer to rocblas_int. Array on the GPU (the size depends on the value of strideP).
Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n).
Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the matrix A_i was interchanged with row ipiv_i[j]. Matrix P_i of the factorization can be derived from ipiv_i.
• [in] strideP: rocblas_stride. Stride from the start of one vector ipiv_i to the next one ipiv_(i+1).
There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n).
• [out] info: pointer to rocblas_int. Array of batch_count integers on the GPU. If info_i = 0, successful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot.
• [in] batch_count: rocblas_int. batch_count >= 0. Number of matrices in the batch.
rocsolver_<type>getf2_strided_batched()
rocblas_status rocsolver_zgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, const rocblas_int lda, const rocblas_stride strideA, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_cgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, const rocblas_int lda, const rocblas_stride strideA, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_dgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, const rocblas_int lda, const rocblas_stride strideA, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_sgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, const rocblas_int lda, const rocblas_stride strideA, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
GETF2_STRIDED_BATCHED computes the LU factorization of a batch of general m-by-n matrices using partial pivoting with row interchanges.
(This is the unblocked Level-2-BLAS version of the algorithm. An optimized internal implementation without rocBLAS calls could be executed with small and mid-size matrices if optimizations are enabled (default option).
For more details, see the section “Tuning rocSOLVER Performance” in the User’s Guide.)
The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i
where P_i is a permutation matrix, L_i is lower triangular with unit diagonal elements (lower trapezoidal if m >
n), and U_i is upper triangular (upper trapezoidal if m < n).
Parameters
• [in] handle: rocblas_handle.
• [in] m: rocblas_int. m >= 0. The number of rows of all matrices A_i in the batch.
• [in] n: rocblas_int. n >= 0. The number of columns of all matrices A_i in the batch.
• [inout] A: pointer to type. Array on the GPU (the size depends on the value of strideA). On entry, the m-by-n matrices A_i to be factored. On exit, the factors L_i and U_i from the factorization. The unit diagonal elements of L_i are not stored.
• [in] lda: rocblas_int. lda >= m. Specifies the leading dimension of matrices A_i.
• [in] strideA: rocblas_stride. Stride from the start of one matrix A_i to the next one A_(i+1).
There is no restriction for the value of strideA. Normal use case is strideA >= lda*n.
• [out] ipiv: pointer to rocblas_int. Array on the GPU (the size depends on the value of strideP).
Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n).
Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the matrix A_i was interchanged with row ipiv_i[j]. Matrix P_i of the factorization can be derived from ipiv_i.
• [in] strideP: rocblas_stride. Stride from the start of one vector ipiv_i to the next one ipiv_(i+1).
There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n).
• [out] info: pointer to rocblas_int. Array of batch_count integers on the GPU. If info_i = 0, successful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot.
• [in] batch_count: rocblas_int. batch_count >= 0. Number of matrices in the batch.
rocsolver_<type>getrf()
rocblas_status rocsolver_zgetrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, const rocblas_int lda, rocblas_int *ipiv, rocblas_int *info)
rocblas_status rocsolver_cgetrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, const rocblas_int lda, rocblas_int *ipiv, rocblas_int *info)
rocblas_status rocsolver_dgetrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, const rocblas_int lda, rocblas_int *ipiv, rocblas_int *info)
rocblas_status rocsolver_sgetrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, const rocblas_int lda, rocblas_int *ipiv, rocblas_int *info)
GETRF computes the LU factorization of a general m-by-n matrix A using partial pivoting with row interchanges.
(This is the blocked Level-3-BLAS version of the algorithm. An optimized internal implementation without rocBLAS calls could be executed with mid-size matrices if optimizations are enabled (default option). For more details, see the section “Tuning rocSOLVER Performance” in the User’s Guide.)
The factorization has the form A = P * L * U
where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).
Parameters
• [in] handle: rocblas_handle.
• [in] m: rocblas_int. m >= 0. The number of rows of the matrix A.
• [in] n: rocblas_int. n >= 0. The number of columns of the matrix A.
• [inout] A: pointer to type. Array on the GPU of dimension lda*n. On entry, the m-by-n matrix A to be factored. On exit, the factors L and U from the factorization. The unit diagonal elements of L are not stored.
• [in] lda: rocblas_int. lda >= m. Specifies the leading dimension of A.
• [out] ipiv: pointer to rocblas_int. Array on the GPU of dimension min(m,n). The vector of pivot indices. Elements of ipiv are 1-based indices. For 1 <= i <= min(m,n), the row i of the matrix was interchanged with row ipiv[i]. Matrix P of the factorization can be derived from ipiv.
• [out] info: pointer to a rocblas_int on the GPU. If info = 0, successful exit. If info = i > 0, U is singular. U(i,i) is the first zero pivot.
rocsolver_<type>getrf_batched()
rocblas_status rocsolver_zgetrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *const A[], const rocblas_int lda, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_cgetrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *const A[], const rocblas_int lda, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_dgetrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], const rocblas_int lda, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_sgetrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], const rocblas_int lda, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
GETRF_BATCHED computes the LU factorization of a batch of general m-by-n matrices using partial pivoting with row interchanges.
(This is the blocked Level-3-BLAS version of the algorithm. An optimized internal implementation without rocBLAS calls could be executed with mid-size matrices if optimizations are enabled (default option). For more details, see the section “Tuning rocSOLVER Performance” in the User’s Guide.)
The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i
where P_i is a permutation matrix, L_i is lower triangular with unit diagonal elements (lower trapezoidal if m >
n), and U_i is upper triangular (upper trapezoidal if m < n).
Parameters
• [in] handle: rocblas_handle.
• [in] m: rocblas_int. m >= 0. The number of rows of all matrices A_i in the batch.
• [in] n: rocblas_int. n >= 0. The number of columns of all matrices A_i in the batch.
• [inout] A: array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n. On entry, the m-by-n matrices A_i to be factored. On exit, the factors L_i and U_i from the factorizations. The unit diagonal elements of L_i are not stored.
• [in] lda: rocblas_int. lda >= m. Specifies the leading dimension of matrices A_i.
• [out] ipiv: pointer to rocblas_int. Array on the GPU (the size depends on the value of strideP).
Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n).
Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the matrix A_i was interchanged with row ipiv_i[j]. Matrix P_i of the factorization can be derived from ipiv_i.
• [in] strideP: rocblas_stride. Stride from the start of one vector ipiv_i to the next one ipiv_(i+1).
There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n).
• [out] info: pointer to rocblas_int. Array of batch_count integers on the GPU. If info_i = 0, successful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot.
• [in] batch_count: rocblas_int. batch_count >= 0. Number of matrices in the batch.
rocsolver_<type>getrf_strided_batched()
rocblas_status rocsolver_zgetrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, const rocblas_int lda, const rocblas_stride strideA, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_cgetrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, const rocblas_int lda, const rocblas_stride strideA, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_dgetrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, const rocblas_int lda, const rocblas_stride strideA, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
rocblas_status rocsolver_sgetrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, const rocblas_int lda, const rocblas_stride strideA, rocblas_int *ipiv, const rocblas_stride strideP, rocblas_int *info, const rocblas_int batch_count)
GETRF_STRIDED_BATCHED computes the LU factorization of a batch of general m-by-n matrices using partial pivoting with row interchanges.
(This is the blocked Level-3-BLAS version of the algorithm. An optimized internal implementation without rocBLAS calls could be executed with mid-size matrices if optimizations are enabled (default option). For more details, see the section “Tuning rocSOLVER Performance” in the User’s Guide.)
The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i
where P_i is a permutation matrix, L_i is lower triangular with unit diagonal elements (lower trapezoidal if m >
n), and U_i is upper triangular (upper trapezoidal if m < n).
Parameters
• [in] handle: rocblas_handle.
• [in] m: rocblas_int. m >= 0. The number of rows of all matrices A_i in the batch.
• [in] n: rocblas_int. n >= 0. The number of columns of all matrices A_i in the batch.
• [inout] A: pointer to type. Array on the GPU (the size depends on the value of strideA). On entry, the m-by-n matrices A_i to be factored. On exit, the factors L_i and U_i from the factorization. The unit diagonal elements of L_i are not stored.
• [in] lda: rocblas_int. lda >= m. Specifies the leading dimension of matrices A_i.
• [in] strideA: rocblas_stride. Stride from the start of one matrix A_i to the next one A_(i+1).
There is no restriction for the value of strideA. Normal use case is strideA >= lda*n.
• [out] ipiv: pointer to rocblas_int. Array on the GPU (the size depends on the value of strideP).
Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n).
Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the matrix A_i was interchanged with row ipiv_i[j]. Matrix P_i of the factorization can be derived from ipiv_i.
• [in] strideP: rocblas_stride. Stride from the start of one vector ipiv_i to the next one ipiv_(i+1).
There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n).
• [out] info: pointer to rocblas_int. Array of batch_count integers on the GPU. If info_i = 0, successful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot.
• [in] batch_count: rocblas_int. batch_count >= 0. Number of matrices in the batch.
rocsolver_<type>geqr2()
rocblas_status rocsolver_zgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, const rocblas_int lda, rocblas_double_complex *ipiv)
rocblas_status rocsolver_cgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, const rocblas_int lda, rocblas_float_complex *ipiv)
rocblas_status rocsolver_dgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, const rocblas_int lda, double *ipiv)
rocblas_status rocsolver_sgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, const rocblas_int lda, float *ipiv)
GEQR2 computes a QR factorization of a general m-by-n matrix A.
(This is the unblocked version of the algorithm).
The factorization has the form
$$A = Q \begin{bmatrix} R \\ 0 \end{bmatrix}$$
where R is upper triangular (upper trapezoidal if m < n), and Q is an m-by-m orthogonal/unitary matrix represented as the product of Householder matrices
Q = H(1) * H(2) * ... * H(k), with k = min(m,n)
Each Householder matrix H(i), for i = 1,2,...,k, is given by
H(i) = I - ipiv[i-1] * v(i) * v(i)'
where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1.
Parameters
• [in] handle: rocblas_handle.
• [in] m: rocblas_int. m >= 0. The number of rows of the matrix A.
• [in] n: rocblas_int. n >= 0. The number of columns of the matrix A.
• [inout] A: pointer to type. Array on the GPU of dimension lda*n. On entry, the m-by-n matrix to be factored. On exit, the elements on and above the diagonal contain the factor R; the elements below the diagonal are the last m - i elements of vector v(i) for i = 1,2,...,min(m,n).
• [in] lda: rocblas_int. lda >= m. Specifies the leading dimension of A.
• [out] ipiv: pointer to type. Array on the GPU of dimension min(m,n). The scalar factors of the Householder matrices H(i).
rocsolver_<type>geqr2_batched()
rocblas_status rocsolver_zgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *const A[], const rocblas_int lda, rocblas_double_complex *ipiv, const rocblas_stride strideP, const rocblas_int batch_count)
rocblas_status rocsolver_cgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *const A[], const rocblas_int lda, rocblas_float_complex *ipiv, const rocblas_stride strideP, const rocblas_int batch_count)
rocblas_status rocsolver_dgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], const rocblas_int lda, double *ipiv, const rocblas_stride strideP, const rocblas_int batch_count)
rocblas_status rocsolver_sgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], const rocblas_int lda, float *ipiv, const rocblas_stride strideP, const rocblas_int batch_count)
GEQR2_BATCHED computes the QR factorization of a batch of general m-by-n matrices.
(This is the unblocked version of the algorithm).
The factorization of matrix A_j in the batch has the form
$$A_j = Q_j \begin{bmatrix} R_j \\ 0 \end{bmatrix}$$
where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is an m-by-m orthogonal/unitary matrix represented as the product of Householder matrices
Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n)
Each Householder matrix H_j(i), for j = 1,2,...,batch_count, and i = 1,2,...,k, is given by
H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)'
where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1.
Parameters
• [in] handle: rocblas_handle.
• [in] m: rocblas_int. m >= 0. The number of rows of all the matrices A_j in the batch.
• [in] n: rocblas_int. n >= 0. The number of columns of all the matrices A_j in the batch.
• [inout] A: array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n. On entry, the m-by-n matrices A_j to be factored. On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the last m - i elements of vector v_j(i) for i = 1,2,...,min(m,n).
• [in] lda: rocblas_int. lda >= m. Specifies the leading dimension of matrices A_j.
• [out] ipiv: pointer to type. Array on the GPU (the size depends on the value of strideP). Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i).
• [in] strideP: rocblas_stride. Stride from the start of one vector ipiv_j to the next one ipiv_(j+1).
There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n).
• [in] batch_count: rocblas_int. batch_count >= 0. Number of matrices in the batch.
rocsolver_<type>geqr2_strided_batched()
rocblas_status rocsolver_zgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, const rocblas_int lda, const rocblas_stride strideA, rocblas_double_complex *ipiv, const rocblas_stride strideP, const rocblas_int batch_count)
rocblas_status rocsolver_cgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, const rocblas_int lda, const rocblas_stride strideA, rocblas_float_complex *ipiv, const rocblas_stride strideP, const rocblas_int batch_count)
rocblas_status rocsolver_dgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, const rocblas_int lda, const rocblas_stride strideA, double *ipiv, const rocblas_stride strideP, const rocblas_int batch_count)
rocblas_status rocsolver_sgeqr2_strided_batched(rocblas_handle handle, const rocblas_int
rocblas_status rocsolver_sgeqr2_strided_batched(rocblas_handle handle, const rocblas_int