Reading Profile files in profile.* NODE 0;CONTEXT 0;THREAD 0: --------------------------------------------------------------------------------------- %Time Exclusive Inclusive #Call #Subrs Inclusive Name msec total msec usec/call --------------------------------------------------------------------------------------- 100.0 14 7,839 1 1 7839918 .TAU application 99.8 7,825 7,825 1 1 7825745 int taupreload_main(int, char **, char **) 0.0 0.008 0.008 1 0 8 pthread_join NODE 0;CONTEXT 0;THREAD 1: --------------------------------------------------------------------------------------- %Time Exclusive Inclusive #Call #Subrs Inclusive Name msec total msec usec/call --------------------------------------------------------------------------------------- 100.0 1,557 7,051 1 24106 7051088 .TAU application 72.0 5,073 5,073 6 0 845627 void bdsqrKernel(int, int, int, int, double*, long, double*, long, double*, int, int, long, double*, int, int, long, double*, int, int, long, int*, int, double, double, double, double, double*, long) [clone .kd] 2.7 191 191 1992 0 96 void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2.0 142 142 2789 0 51 void ger_kernel<32, 32, 2, false, double, double const*, double const*, double*>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double*, long, int, long) [clone .kd] 0.4 24 24 3810 0 7 void gemvn_kernel<64, 16, int, double, double const*, double, double>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 0.2 15 15 4331 0 4 void rocblas_scal_kernel<256, double, double const*, double*>(int, double const*, long, double*, long, int, long) [clone .kd] 0.1 10 10 2256 0 5 void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 0.1 5 5 1782 0 3 void set_taubeta(double*, long, double*, double*, int, long) [clone .kd] 0.1 5 5 1794 0 3 void subtract_tau(int, int, double*, int, int, long, double*, long) [clone .kd] 0.1 5 5 1794 0 3 void set_diag(double*, int, long, double*, int, int, long, int, bool) [clone .kd] 0.1 5 5 1182 0 4 void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 0.1 4 4 600 0 8 void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 0.1 3 3 1050 0 4 void restore_diag(double*, int, long, double*, int, int, long, int) [clone .kd] 0.0 2 2 600 0 5 void rocblas_dot_kernel_reduce<512, 2, double, double>(int, double*, double*) [clone .kd] 0.0 1 1 6 0 195 Cijk_Ailk_Bljk_DB_MT128x128x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW2_SNLL0_TT4_64_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM11.kd 0.0 0.406 0.406 12 0 34 void org2r_init_ident(int, int, int, double*, int, int, long) [clone .kd] 0.0 0.109 0.109 12 0 9 Cijk_Ailk_Bjlk_DB_MT16x16x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS1_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS0_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM5.kd 0.0 0.0945 0.0945 12 0 8 Cijk_Ailk_Bljk_DB_MT32x16x16_MI16x16x4x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS1_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM11.kd 0.0 0.064 0.064 12 0 5 void copy_mat(int, int, double*, int, int, long, double*, int, int, long, no_mask, rocblas_fill_, rocblas_diagonal_) [clone .kd] 0.0 0.0553 0.0553 18 0 3 void restau(int, double*, long) [clone .kd] 0.0 0.0503 0.0503 12 0 4 void copyshift_down(bool, int, double*, int, int, long, double*, int, int, long) [clone .kd] 0.0 0.0385 0.0385 18 0 2 void reset_batch_info(double*, long, int, int) [clone .kd] 0.0 0.0278 0.0278 6 0 5 void set_zero(int, int, double*, int, int, long, rocblas_fill_) [clone .kd] 0.0 0.0238 0.0238 6 0 4 void iota_n(double*, unsigned int, double) [clone .kd] 0.0 0.0195 0.0195 6 0 3 void orgl2_init_ident(int, int, int, double*, int, int, long) [clone .kd] --------------------------------------------------------------------------------------- USER EVENTS Profile :NODE 0, CONTEXT 0, THREAD 1 --------------------------------------------------------------------------------------- NumSamples MaxValue MinValue MeanValue Std. Dev. Event Name --------------------------------------------------------------------------------------- 12 1600 576 1088 512 Grid Size : Cijk_Ailk_Bjlk_DB_MT16x16x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS1_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS0_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM5.kd 6 2.002E+05 2.002E+05 2.002E+05 0 Grid Size : Cijk_Ailk_Bljk_DB_MT128x128x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW2_SNLL0_TT4_64_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM11.kd 12 1920 768 1344 576 Grid Size : Cijk_Ailk_Bljk_DB_MT32x16x16_MI16x16x4x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS1_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM11.kd 6 1 1 1 0 Grid Size : void bdsqrKernel(int, int, int, int, double*, long, double*, long, double*, int, int, long, double*, int, int, long, double*, int, int, long, int*, int, double, double, double, double, double*, long) [clone .kd] 12 1.638E+04 1.638E+04 1.638E+04 0 Grid Size : void copy_mat(int, int, double*, int, int, long, double*, int, int, long, no_mask, rocblas_fill_, rocblas_diagonal_) [clone .kd] 12 1.638E+04 1.638E+04 1.638E+04 0 Grid Size : void copyshift_down(bool, int, double*, int, int, long, double*, int, int, long) [clone .kd] 3810 1024 1024 1024 0 Grid Size : void gemvn_kernel<64, 16, int, double, double const*, double, double>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 1992 2.534E+04 256 1.194E+04 7410 Grid Size : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2256 2.534E+04 256 1.067E+04 7508 Grid Size : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2790 6.4E+06 1024 1.845E+06 2.361E+06 Grid Size : void ger_kernel<32, 32, 2, false, double, double const*, double const*, double*>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double*, long, int, long) [clone .kd] 6 32 32 32 0 Grid Size : void iota_n(double*, unsigned int, double) [clone .kd] 12 1.28E+07 1.638E+04 6.408E+06 6.392E+06 Grid Size : void org2r_init_ident(int, int, int, double*, int, int, long) [clone .kd] 6 1.638E+04 1.638E+04 1.638E+04 0 Grid Size : void orgl2_init_ident(int, int, int, double*, int, int, long) [clone .kd] 18 1 1 1 0 Grid Size : void reset_batch_info(double*, long, int, int) [clone .kd] 18 128 128 128 0 Grid Size : void restau(int, double*, long) [clone .kd] 1050 64 1 2.44 9.415 Grid Size : void restore_diag(double*, int, long, double*, int, int, long, int) [clone .kd] 600 5.018E+04 5.018E+04 5.018E+04 0 Grid Size : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 1182 1024 1024 1024 0 Grid Size : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 600 512 512 512 0 Grid Size : void rocblas_dot_kernel_reduce<512, 2, double, double>(int, double*, double*) [clone .kd] 4332 1.001E+05 256 2.791E+04 4.468E+04 Grid Size : void rocblas_scal_kernel<256, double, double const*, double*>(int, double const*, long, double*, long, int, long) [clone .kd] 1794 1 1 1 0 Grid Size : void set_diag(double*, int, long, double*, int, int, long, int, bool) [clone .kd] 1782 1 1 1 0 Grid Size : void set_taubeta(double*, long, double*, double*, int, long) [clone .kd] 6 1.638E+04 1.638E+04 1.638E+04 0 Grid Size : void set_zero(int, int, double*, int, int, long, rocblas_fill_) [clone .kd] 1794 1 1 1 0 Grid Size : void subtract_tau(int, int, double*, int, int, long, double*, long) [clone .kd] 12 2048 2048 2048 0 LDS Memory Size : Cijk_Ailk_Bjlk_DB_MT16x16x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS1_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS0_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM5.kd 6 1.638E+04 1.638E+04 1.638E+04 0 LDS Memory Size : Cijk_Ailk_Bljk_DB_MT128x128x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW2_SNLL0_TT4_64_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM11.kd 12 1.434E+04 1.434E+04 1.434E+04 0 LDS Memory Size : Cijk_Ailk_Bljk_DB_MT32x16x16_MI16x16x4x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS1_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM11.kd 6 0 0 0 0 LDS Memory Size : void bdsqrKernel(int, int, int, int, double*, long, double*, long, double*, int, int, long, double*, int, int, long, double*, int, int, long, int*, int, double, double, double, double, double*, long) [clone .kd] 12 0 0 0 0 LDS Memory Size : void copy_mat(int, int, double*, int, int, long, double*, int, int, long, no_mask, rocblas_fill_, rocblas_diagonal_) [clone .kd] 12 0 0 0 0 LDS Memory Size : void copyshift_down(bool, int, double*, int, int, long, double*, int, int, long) [clone .kd] 3810 3.277E+04 3.277E+04 3.277E+04 0 LDS Memory Size : void gemvn_kernel<64, 16, int, double, double const*, double, double>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 1992 2048 2048 2048 0 LDS Memory Size : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2256 2048 2048 2048 0 LDS Memory Size : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2790 1024 1024 1024 0 LDS Memory Size : void ger_kernel<32, 32, 2, false, double, double const*, double const*, double*>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double*, long, int, long) [clone .kd] 6 0 0 0 0 LDS Memory Size : void iota_n(double*, unsigned int, double) [clone .kd] 12 0 0 0 0 LDS Memory Size : void org2r_init_ident(int, int, int, double*, int, int, long) [clone .kd] 6 0 0 0 0 LDS Memory Size : void orgl2_init_ident(int, int, int, double*, int, int, long) [clone .kd] 18 0 0 0 0 LDS Memory Size : void reset_batch_info(double*, long, int, int) [clone .kd] 18 0 0 0 0 LDS Memory Size : void restau(int, double*, long) [clone .kd] 1050 0 0 0 0 LDS Memory Size : void restore_diag(double*, int, long, double*, int, int, long, int) [clone .kd] 600 512 512 512 0 LDS Memory Size : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 1182 512 512 512 0 LDS Memory Size : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 600 512 512 512 0 LDS Memory Size : void rocblas_dot_kernel_reduce<512, 2, double, double>(int, double*, double*) [clone .kd] 4332 0 0 0 0 LDS Memory Size : void rocblas_scal_kernel<256, double, double const*, double*>(int, double const*, long, double*, long, int, long) [clone .kd] 1794 0 0 0 0 LDS Memory Size : void set_diag(double*, int, long, double*, int, int, long, int, bool) [clone .kd] 1782 0 0 0 0 LDS Memory Size : void set_taubeta(double*, long, double*, double*, int, long) [clone .kd] 6 0 0 0 0 LDS Memory Size : void set_zero(int, int, double*, int, int, long, rocblas_fill_) [clone .kd] 1794 0 0 0 0 LDS Memory Size : void subtract_tau(int, int, double*, int, int, long, double*, long) [clone .kd] 12 96 96 96 0 Scalar Register Size (SGPR) : Cijk_Ailk_Bjlk_DB_MT16x16x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS1_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS0_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM5.kd 6 88 88 88 0 Scalar Register Size (SGPR) : Cijk_Ailk_Bljk_DB_MT128x128x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW2_SNLL0_TT4_64_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM11.kd 12 88 88 88 0 Scalar Register Size (SGPR) : Cijk_Ailk_Bljk_DB_MT32x16x16_MI16x16x4x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS1_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM11.kd 6 112 112 112 0 Scalar Register Size (SGPR) : void bdsqrKernel(int, int, int, int, double*, long, double*, long, double*, int, int, long, double*, int, int, long, double*, int, int, long, int*, int, double, double, double, double, double*, long) [clone .kd] 12 40 40 40 0 Scalar Register Size (SGPR) : void copy_mat(int, int, double*, int, int, long, double*, int, int, long, no_mask, rocblas_fill_, rocblas_diagonal_) [clone .kd] 12 32 32 32 0 Scalar Register Size (SGPR) : void copyshift_down(bool, int, double*, int, int, long, double*, int, int, long) [clone .kd] 3810 56 56 56 0 Scalar Register Size (SGPR) : void gemvn_kernel<64, 16, int, double, double const*, double, double>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 1992 48 48 48 0 Scalar Register Size (SGPR) : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2256 48 48 48 0 Scalar Register Size (SGPR) : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2790 40 40 40 0 Scalar Register Size (SGPR) : void ger_kernel<32, 32, 2, false, double, double const*, double const*, double*>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double*, long, int, long) [clone .kd] 6 16 16 16 0 Scalar Register Size (SGPR) : void iota_n(double*, unsigned int, double) [clone .kd] 12 24 24 24 0 Scalar Register Size (SGPR) : void org2r_init_ident(int, int, int, double*, int, int, long) [clone .kd] 6 24 24 24 0 Scalar Register Size (SGPR) : void orgl2_init_ident(int, int, int, double*, int, int, long) [clone .kd] 18 24 24 24 0 Scalar Register Size (SGPR) : void reset_batch_info(double*, long, int, int) [clone .kd] 18 24 24 24 0 Scalar Register Size (SGPR) : void restau(int, double*, long) [clone .kd] 1050 32 32 32 0 Scalar Register Size (SGPR) : void restore_diag(double*, int, long, double*, int, int, long, int) [clone .kd] 600 32 32 32 0 Scalar Register Size (SGPR) : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 1182 32 32 32 0 Scalar Register Size (SGPR) : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 600 32 32 32 0 Scalar Register Size (SGPR) : void rocblas_dot_kernel_reduce<512, 2, double, double>(int, double*, double*) [clone .kd] 4332 32 32 32 0 Scalar Register Size (SGPR) : void rocblas_scal_kernel<256, double, double const*, double*>(int, double const*, long, double*, long, int, long) [clone .kd] 1794 32 32 32 0 Scalar Register Size (SGPR) : void set_diag(double*, int, long, double*, int, int, long, int, bool) [clone .kd] 1782 32 32 32 0 Scalar Register Size (SGPR) : void set_taubeta(double*, long, double*, double*, int, long) [clone .kd] 6 24 24 24 0 Scalar Register Size (SGPR) : void set_zero(int, int, double*, int, int, long, rocblas_fill_) [clone .kd] 1794 24 24 24 0 Scalar Register Size (SGPR) : void subtract_tau(int, int, double*, int, int, long, double*, long) [clone .kd] 12 0 0 0 0 Scratch Memory Size : Cijk_Ailk_Bjlk_DB_MT16x16x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS1_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS0_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM5.kd 6 0 0 0 0 Scratch Memory Size : Cijk_Ailk_Bljk_DB_MT128x128x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW2_SNLL0_TT4_64_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM11.kd 12 0 0 0 0 Scratch Memory Size : Cijk_Ailk_Bljk_DB_MT32x16x16_MI16x16x4x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS1_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM11.kd 6 0 0 0 0 Scratch Memory Size : void bdsqrKernel(int, int, int, int, double*, long, double*, long, double*, int, int, long, double*, int, int, long, double*, int, int, long, int*, int, double, double, double, double, double*, long) [clone .kd] 12 0 0 0 0 Scratch Memory Size : void copy_mat(int, int, double*, int, int, long, double*, int, int, long, no_mask, rocblas_fill_, rocblas_diagonal_) [clone .kd] 12 0 0 0 0 Scratch Memory Size : void copyshift_down(bool, int, double*, int, int, long, double*, int, int, long) [clone .kd] 3810 0 0 0 0 Scratch Memory Size : void gemvn_kernel<64, 16, int, double, double const*, double, double>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 1992 0 0 0 0 Scratch Memory Size : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2256 0 0 0 0 Scratch Memory Size : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2790 0 0 0 0 Scratch Memory Size : void ger_kernel<32, 32, 2, false, double, double const*, double const*, double*>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double*, long, int, long) [clone .kd] 6 0 0 0 0 Scratch Memory Size : void iota_n(double*, unsigned int, double) [clone .kd] 12 0 0 0 0 Scratch Memory Size : void org2r_init_ident(int, int, int, double*, int, int, long) [clone .kd] 6 0 0 0 0 Scratch Memory Size : void orgl2_init_ident(int, int, int, double*, int, int, long) [clone .kd] 18 0 0 0 0 Scratch Memory Size : void reset_batch_info(double*, long, int, int) [clone .kd] 18 0 0 0 0 Scratch Memory Size : void restau(int, double*, long) [clone .kd] 1050 0 0 0 0 Scratch Memory Size : void restore_diag(double*, int, long, double*, int, int, long, int) [clone .kd] 600 0 0 0 0 Scratch Memory Size : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 1182 0 0 0 0 Scratch Memory Size : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 600 0 0 0 0 Scratch Memory Size : void rocblas_dot_kernel_reduce<512, 2, double, double>(int, double*, double*) [clone .kd] 4332 0 0 0 0 Scratch Memory Size : void rocblas_scal_kernel<256, double, double const*, double*>(int, double const*, long, double*, long, int, long) [clone .kd] 1794 0 0 0 0 Scratch Memory Size : void set_diag(double*, int, long, double*, int, int, long, int, bool) [clone .kd] 1782 0 0 0 0 Scratch Memory Size : void set_taubeta(double*, long, double*, double*, int, long) [clone .kd] 6 0 0 0 0 Scratch Memory Size : void set_zero(int, int, double*, int, int, long, rocblas_fill_) [clone .kd] 1794 0 0 0 0 Scratch Memory Size : void subtract_tau(int, int, double*, int, int, long, double*, long) [clone .kd] 12 24 24 24 0 Vector Register Size (VGPR) : Cijk_Ailk_Bjlk_DB_MT16x16x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS1_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS0_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM5.kd 6 128 128 128 0 Vector Register Size (VGPR) : Cijk_Ailk_Bljk_DB_MT128x128x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW2_SNLL0_TT4_64_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM11.kd 12 64 64 64 0 Vector Register Size (VGPR) : Cijk_Ailk_Bljk_DB_MT32x16x16_MI16x16x4x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS1_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM11.kd 6 16 16 16 0 Vector Register Size (VGPR) : void bdsqrKernel(int, int, int, int, double*, long, double*, long, double*, int, int, long, double*, int, int, long, double*, int, int, long, int*, int, double, double, double, double, double*, long) [clone .kd] 12 4 4 4 0 Vector Register Size (VGPR) : void copy_mat(int, int, double*, int, int, long, double*, int, int, long, no_mask, rocblas_fill_, rocblas_diagonal_) [clone .kd] 12 8 8 8 0 Vector Register Size (VGPR) : void copyshift_down(bool, int, double*, int, int, long, double*, int, int, long) [clone .kd] 3810 36 36 36 0 Vector Register Size (VGPR) : void gemvn_kernel<64, 16, int, double, double const*, double, double>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 1992 8 8 8 0 Vector Register Size (VGPR) : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2256 8 8 8 0 Vector Register Size (VGPR) : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2790 8 8 8 0 Vector Register Size (VGPR) : void ger_kernel<32, 32, 2, false, double, double const*, double const*, double*>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double*, long, int, long) [clone .kd] 6 4 4 4 0 Vector Register Size (VGPR) : void iota_n(double*, unsigned int, double) [clone .kd] 12 4 4 4 0 Vector Register Size (VGPR) : void org2r_init_ident(int, int, int, double*, int, int, long) [clone .kd] 6 4 4 4 0 Vector Register Size (VGPR) : void orgl2_init_ident(int, int, int, double*, int, int, long) [clone .kd] 18 4 4 4 0 Vector Register Size (VGPR) : void reset_batch_info(double*, long, int, int) [clone .kd] 18 4 4 4 0 Vector Register Size (VGPR) : void restau(int, double*, long) [clone .kd] 1050 4 4 4 0 Vector Register Size (VGPR) : void restore_diag(double*, int, long, double*, int, int, long, int) [clone .kd] 600 8 8 8 0 Vector Register Size (VGPR) : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 1182 8 8 8 0 Vector Register Size (VGPR) : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 600 8 8 8 0 Vector Register Size (VGPR) : void rocblas_dot_kernel_reduce<512, 2, double, double>(int, double*, double*) [clone .kd] 4332 4 4 4 0 Vector Register Size (VGPR) : void rocblas_scal_kernel<256, double, double const*, double*>(int, double const*, long, double*, long, int, long) [clone .kd] 1794 8 8 8 0 Vector Register Size (VGPR) : void set_diag(double*, int, long, double*, int, int, long, int, bool) [clone .kd] 1782 8 8 8 0 Vector Register Size (VGPR) : void set_taubeta(double*, long, double*, double*, int, long) [clone .kd] 6 4 4 4 0 Vector Register Size (VGPR) : void set_zero(int, int, double*, int, int, long, rocblas_fill_) [clone .kd] 1794 4 4 4 0 Vector Register Size (VGPR) : void subtract_tau(int, int, double*, int, int, long, double*, long) [clone .kd] 12 64 64 64 0 Work Group Size : Cijk_Ailk_Bjlk_DB_MT16x16x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS1_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS0_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM5.kd 6 256 256 256 0 Work Group Size : Cijk_Ailk_Bljk_DB_MT128x128x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW2_SNLL0_TT4_64_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM11.kd 12 128 128 128 0 Work Group Size : Cijk_Ailk_Bljk_DB_MT32x16x16_MI16x16x4x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS1_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM11.kd 6 1 1 1 0 Work Group Size : void bdsqrKernel(int, int, int, int, double*, long, double*, long, double*, int, int, long, double*, int, int, long, double*, int, int, long, int*, int, double, double, double, double, double*, long) [clone .kd] 12 1024 1024 1024 0 Work Group Size : void copy_mat(int, int, double*, int, int, long, double*, int, int, long, no_mask, rocblas_fill_, rocblas_diagonal_) [clone .kd] 12 1024 1024 1024 0 Work Group Size : void copyshift_down(bool, int, double*, int, int, long, double*, int, int, long) [clone .kd] 3810 1024 1024 1024 0 Work Group Size : void gemvn_kernel<64, 16, int, double, double const*, double, double>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 1992 256 256 256 0 Work Group Size : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2256 256 256 256 0 Work Group Size : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2790 1024 1024 1024 0 Work Group Size : void ger_kernel<32, 32, 2, false, double, double const*, double const*, double*>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double*, long, int, long) [clone .kd] 6 32 32 32 0 Work Group Size : void iota_n(double*, unsigned int, double) [clone .kd] 12 1024 1024 1024 0 Work Group Size : void org2r_init_ident(int, int, int, double*, int, int, long) [clone .kd] 6 1024 1024 1024 0 Work Group Size : void orgl2_init_ident(int, int, int, double*, int, int, long) [clone .kd] 18 1 1 1 0 Work Group Size : void reset_batch_info(double*, long, int, int) [clone .kd] 18 128 128 128 0 Work Group Size : void restau(int, double*, long) [clone .kd] 1050 64 1 2.44 9.415 Work Group Size : void restore_diag(double*, int, long, double*, int, int, long, int) [clone .kd] 600 512 512 512 0 Work Group Size : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 1182 1024 1024 1024 0 Work Group Size : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 600 512 512 512 0 Work Group Size : void rocblas_dot_kernel_reduce<512, 2, double, double>(int, double*, double*) [clone .kd] 4332 256 256 256 0 Work Group Size : void rocblas_scal_kernel<256, double, double const*, double*>(int, double const*, long, double*, long, int, long) [clone .kd] 1794 1 1 1 0 Work Group Size : void set_diag(double*, int, long, double*, int, int, long, int, bool) [clone .kd] 1782 1 1 1 0 Work Group Size : void set_taubeta(double*, long, double*, double*, int, long) [clone .kd] 6 1024 1024 1024 0 Work Group Size : void set_zero(int, int, double*, int, int, long, rocblas_fill_) [clone .kd] 1794 1 1 1 0 Work Group Size : void subtract_tau(int, int, double*, int, int, long, double*, long) [clone .kd] 12 2.593E+06 2.593E+06 2.593E+06 0 fbarrier count : Cijk_Ailk_Bjlk_DB_MT16x16x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS1_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS0_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM5.kd 6 6.379E+05 6.379E+05 6.379E+05 0 fbarrier count : Cijk_Ailk_Bljk_DB_MT128x128x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW2_SNLL0_TT4_64_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM11.kd 12 1.551E+06 1.551E+06 1.551E+06 0 fbarrier count : Cijk_Ailk_Bljk_DB_MT32x16x16_MI16x16x4x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS1_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM11.kd 6 3.053E+04 3.053E+04 3.053E+04 0 fbarrier count : void bdsqrKernel(int, int, int, int, double*, long, double*, long, double*, int, int, long, double*, int, int, long, double*, int, int, long, int*, int, double, double, double, double, double*, long) [clone .kd] 12 3.456E+04 3.456E+04 3.456E+04 0 fbarrier count : void copy_mat(int, int, double*, int, int, long, double*, int, int, long, no_mask, rocblas_fill_, rocblas_diagonal_) [clone .kd] 12 1.978E+04 1.978E+04 1.978E+04 0 fbarrier count : void copyshift_down(bool, int, double*, int, int, long, double*, int, int, long) [clone .kd] 3810 1.445E+05 1.445E+05 1.445E+05 0 fbarrier count : void gemvn_kernel<64, 16, int, double, double const*, double, double>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 1992 1.864E+05 1.864E+05 1.864E+05 0 fbarrier count : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2256 2.172E+05 2.172E+05 2.172E+05 0 fbarrier count : void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 2790 1.28E+04 1.28E+04 1.28E+04 0 fbarrier count : void ger_kernel<32, 32, 2, false, double, double const*, double const*, double*>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double*, long, int, long) [clone .kd] 6 2240 2240 2240 0 fbarrier count : void iota_n(double*, unsigned int, double) [clone .kd] 12 6080 6080 6080 0 fbarrier count : void org2r_init_ident(int, int, int, double*, int, int, long) [clone .kd] 6 7104 7104 7104 0 fbarrier count : void orgl2_init_ident(int, int, int, double*, int, int, long) [clone .kd] 18 3072 3072 3072 0 fbarrier count : void reset_batch_info(double*, long, int, int) [clone .kd] 18 6464 6464 6464 0 fbarrier count : void restau(int, double*, long) [clone .kd] 1050 5760 5760 5760 0 fbarrier count : void restore_diag(double*, int, long, double*, int, int, long, int) [clone .kd] 600 8.531E+04 8.531E+04 8.531E+04 0 fbarrier count : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 1182 8.141E+04 8.141E+04 8.141E+04 0 fbarrier count : void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 600 7.827E+04 7.827E+04 7.827E+04 0 fbarrier count : void rocblas_dot_kernel_reduce<512, 2, double, double>(int, double*, double*) [clone .kd] 4332 1.325E+04 1.325E+04 1.325E+04 0 fbarrier count : void rocblas_scal_kernel<256, double, double const*, double*>(int, double const*, long, double*, long, int, long) [clone .kd] 1794 5312 5312 5312 0 fbarrier count : void set_diag(double*, int, long, double*, int, int, long, int, bool) [clone .kd] 1782 3776 3776 3776 0 fbarrier count : void set_taubeta(double*, long, double*, double*, int, long) [clone .kd] 6 1.267E+04 1.267E+04 1.267E+04 0 fbarrier count : void set_zero(int, int, double*, int, int, long, rocblas_fill_) [clone .kd] 1794 6272 6272 6272 0 fbarrier count : void subtract_tau(int, int, double*, int, int, long, double*, long) [clone .kd] --------------------------------------------------------------------------------------- FUNCTION SUMMARY (total): --------------------------------------------------------------------------------------- %Time Exclusive Inclusive #Call #Subrs Inclusive Name msec total msec usec/call --------------------------------------------------------------------------------------- 100.0 1,571 14,891 2 24107 7445503 .TAU application 52.6 7,825 7,825 1 1 7825745 int taupreload_main(int, char **, char **) 34.1 5,073 5,073 6 0 845627 void bdsqrKernel(int, int, int, int, double*, long, double*, long, double*, int, int, long, double*, int, int, long, double*, int, int, long, int*, int, double, double, double, double, double*, long) [clone .kd] 1.3 191 191 1992 0 96 void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 1.0 142 142 2789 0 51 void ger_kernel<32, 32, 2, false, double, double const*, double const*, double*>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double*, long, int, long) [clone .kd] 0.2 24 24 3810 0 7 void gemvn_kernel<64, 16, int, double, double const*, double, double>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 0.1 15 15 4331 0 4 void rocblas_scal_kernel<256, double, double const*, double*>(int, double const*, long, double*, long, int, long) [clone .kd] 0.1 10 10 2256 0 5 void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 0.0 5 5 1782 0 3 void set_taubeta(double*, long, double*, double*, int, long) [clone .kd] 0.0 5 5 1794 0 3 void subtract_tau(int, int, double*, int, int, long, double*, long) [clone .kd] 0.0 5 5 1794 0 3 void set_diag(double*, int, long, double*, int, int, long, int, bool) [clone .kd] 0.0 5 5 1182 0 4 void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 0.0 4 4 600 0 8 void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 0.0 3 3 1050 0 4 void restore_diag(double*, int, long, double*, int, int, long, int) [clone .kd] 0.0 2 2 600 0 5 void rocblas_dot_kernel_reduce<512, 2, double, double>(int, double*, double*) [clone .kd] 0.0 1 1 6 0 195 Cijk_Ailk_Bljk_DB_MT128x128x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW2_SNLL0_TT4_64_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM11.kd 0.0 0.406 0.406 12 0 34 void org2r_init_ident(int, int, int, double*, int, int, long) [clone .kd] 0.0 0.109 0.109 12 0 9 Cijk_Ailk_Bjlk_DB_MT16x16x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS1_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS0_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM5.kd 0.0 0.0945 0.0945 12 0 8 Cijk_Ailk_Bljk_DB_MT32x16x16_MI16x16x4x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS1_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM11.kd 0.0 0.064 0.064 12 0 5 void copy_mat(int, int, double*, int, int, long, double*, int, int, long, no_mask, rocblas_fill_, rocblas_diagonal_) [clone .kd] 0.0 0.0553 0.0553 18 0 3 void restau(int, double*, long) [clone .kd] 0.0 0.0503 0.0503 12 0 4 void copyshift_down(bool, int, double*, int, int, long, double*, int, int, long) [clone .kd] 0.0 0.0385 0.0385 18 0 2 void reset_batch_info(double*, long, int, int) [clone .kd] 0.0 0.0278 0.0278 6 0 5 void set_zero(int, int, double*, int, int, long, rocblas_fill_) [clone .kd] 0.0 0.0238 0.0238 6 0 4 void iota_n(double*, unsigned int, double) [clone .kd] 0.0 0.0195 0.0195 6 0 3 void orgl2_init_ident(int, int, int, double*, int, int, long) [clone .kd] 0.0 0.008 0.008 1 0 8 pthread_join FUNCTION SUMMARY (mean): --------------------------------------------------------------------------------------- %Time Exclusive Inclusive #Call #Subrs Inclusive Name msec total msec usec/call --------------------------------------------------------------------------------------- 100.0 785 7,445 1 12053.5 7445503 .TAU application 52.6 3,912 3,912 0.5 0.5 7825745 int taupreload_main(int, char **, char **) 34.1 2,536 2,536 3 0 845627 void bdsqrKernel(int, int, int, int, double*, long, double*, long, double*, int, int, long, double*, int, int, long, double*, int, int, long, int*, int, double, double, double, double, double*, long) [clone .kd] 1.3 95 95 996 0 96 void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 1.0 71 71 1394.5 0 51 void ger_kernel<32, 32, 2, false, double, double const*, double const*, double*>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double*, long, int, long) [clone .kd] 0.2 12 12 1905 0 7 void gemvn_kernel<64, 16, int, double, double const*, double, double>(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 0.1 7 7 2165.5 0 4 void rocblas_scal_kernel<256, double, double const*, double*>(int, double const*, long, double*, long, int, long) [clone .kd] 0.1 5 5 1128 0 5 void gemvt_kernel(int, int, double const*, long, double const*, long, int, long, double const*, long, int, long, double const*, long, double*, long, int, long) [clone .kd] 0.0 2 2 891 0 3 void set_taubeta(double*, long, double*, double*, int, long) [clone .kd] 0.0 2 2 897 0 3 void subtract_tau(int, int, double*, int, int, long, double*, long) [clone .kd] 0.0 2 2 897 0 3 void set_diag(double*, int, long, double*, int, int, long, int, bool) [clone .kd] 0.0 2 2 591 0 4 void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 0.0 2 2 300 0 8 void rocblas_dot_kernel_magsq(int, double const*, long, int, long, double*, double*) [clone .kd] 0.0 1 1 525 0 4 void restore_diag(double*, int, long, double*, int, int, long, int) [clone .kd] 0.0 1 1 300 0 5 void rocblas_dot_kernel_reduce<512, 2, double, double>(int, double*, double*) [clone .kd] 0.0 0.586 0.586 3 0 195 Cijk_Ailk_Bljk_DB_MT128x128x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW2_SNLL0_TT4_64_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM11.kd 0.0 0.203 0.203 6 0 34 void org2r_init_ident(int, int, int, double*, int, int, long) [clone .kd] 0.0 0.0545 0.0545 6 0 9 Cijk_Ailk_Bjlk_DB_MT16x16x8_MI16x16x4x1_SN_1LDSB1_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS1_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS0_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM5.kd 0.0 0.0473 0.0473 6 0 8 Cijk_Ailk_Bljk_DB_MT32x16x16_MI16x16x4x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS0_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW1_GSU1_GSUAMB_GLS1_ISA90a_IU2_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW1_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR3_RK0_SIA3_SS1_SU0_SUM0_SUS0_SCIUI1_SPO1_SRVW0_SSO6_SVW1_SNLL0_TT1_16_TLDS1_USFGROn1_VAW1_VSn1_VW1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM11.kd 0.0 0.032 0.032 6 0 5 void copy_mat(int, int, double*, int, int, long, double*, int, int, long, no_mask, rocblas_fill_, rocblas_diagonal_) [clone .kd] 0.0 0.0276 0.0276 9 0 3 void restau(int, double*, long) [clone .kd] 0.0 0.0251 0.0251 6 0 4 void copyshift_down(bool, int, double*, int, int, long, double*, int, int, long) [clone .kd] 0.0 0.0192 0.0192 9 0 2 void reset_batch_info(double*, long, int, int) [clone .kd] 0.0 0.0139 0.0139 3 0 5 void set_zero(int, int, double*, int, int, long, rocblas_fill_) [clone .kd] 0.0 0.0119 0.0119 3 0 4 void iota_n(double*, unsigned int, double) [clone .kd] 0.0 0.00975 0.00975 3 0 3 void orgl2_init_ident(int, int, int, double*, int, int, long) [clone .kd] 0.0 0.004 0.004 0.5 0 8 pthread_join