|-> 8.47 - 100.00% [1] {min=8.47, max=8.47, mean=8.47, var=0.00, std dev=0.00} APEX MAIN | |-> 0.64 - 7.61% [946] {min=0.00, max=0.01, mean=0.00, var=0.00, std dev=0.00} cudaMalloc | |-> 0.62 - 7.37% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] EvalEOSForElems A | | |-> 0.54 - 6.33% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.53 - 6.28% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.06% | | |-> 0.02 - 0.19% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.54 - 6.42% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.01 - 0.15% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.05% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.01 - 0.10% | | |-> 0.01 - 0.10% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.01 - 0.07% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.02% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.05% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.04 - 0.53% | |-> 0.62 - 7.30% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] IntegrateStressForElems A | | |-> 0.62 - 7.27% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.62 - 7.26% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.62 - 7.27% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.58 - 6.82% [64] {min=0.00, max=0.28, mean=0.01, var=0.00, std dev=0.03} Kokkos::parallel_for [Cuda, Dev:0] IntegrateStressForElems B | | |-> 0.29 - 3.47% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.29 - 3.47% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.29 - 3.48% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::Cuda> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.28 - 3.33% | |-> 0.57 - 6.70% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcEnergyForElems | | |-> 0.48 - 5.66% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.47 - 5.60% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.05% | | |-> 0.02 - 0.18% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.49 - 5.75% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.01 - 0.15% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.05% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.01 - 0.10% | | |-> 0.01 - 0.10% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.01 - 0.07% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.02% [2240] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.06% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.05 - 0.54% | |-> 0.54 - 6.36% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcFBHourglassForceForElems A | | |-> 0.54 - 6.32% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.54 - 6.32% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.54 - 6.33% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory >, Kokkos::View >, Kokkos::View >, Kokkos::View >, Kokkos::View >, Kokkos::View >, double, int, int)::{lambda(int const&)#1}, Kokkos::RangePolicy, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.52 - 6.12% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} Kokkos::parallel_reduce [Cuda, Dev:0] 18__nv_hdl_wrapper_tILb0ELb0E11__nv_dl_tagIPFvR6DomainPddEXadL_ZN72_INTERNAL_50__tmp_tmpxft_0025c612_00000000_6_lulesh_cudafe1_cpp_c8b50e3d28CalcHourglassControlForElemsES2_S3_dEELj1EEFviRiEJS1_N6Kokkos4ViewIPS3_JNSA_12MemoryTraitsILj1EEEEEESF_SF_SF_SF_SF_S3_EE | | |-> 0.51 - 6.08% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} cudaStreamSynchronize | | | |-> 0.51 - 6.08% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} GPU: Stream Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.52 - 6.08% [64] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::InvalidType, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.32 - 3.78% [29] {min=0.00, max=0.04, mean=0.01, var=0.00, std dev=0.01} Kokkos::parallel_for [OpenMP] Kokkos::View::initialization [_mirror] via memset | | |-> 0.00 - 0.01% [29] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [29] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.01% | | Remainder: 0.32 - 3.77% | |-> 0.30 - 3.52% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcFBHourglassForceForElems B | | |-> 0.30 - 3.48% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.29 - 3.48% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.30 - 3.49% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory >, Kokkos::View >, Kokkos::View >, Kokkos::View >, Kokkos::View >, Kokkos::View >, double, int, int)::{lambda(Kokkos::Impl::CudaTeamMember const&)#1}, Kokkos::TeamPolicy<>, Kokkos::Cuda> >() | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.28 - 3.25% [940] {min=0.00, max=0.01, mean=0.00, var=0.00, std dev=0.00} cudaFree | |-> 0.21 - 2.45% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcMonotonicQRegionForElems | | |-> 0.18 - 2.09% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.18 - 2.07% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.01 - 0.09% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.18 - 2.11% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.05% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.03% | | |-> 0.00 - 0.03% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.01% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.01 - 0.17% | |-> 0.18 - 2.18% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcKinematicsForElems | | |-> 0.18 - 2.15% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.18 - 2.15% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.18 - 2.15% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.13 - 1.52% [11712] {min=0.00, max=0.09, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | |-> 0.01 - 0.15% [11712] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | Remainder: 0.12 - 1.37% | |-> 0.13 - 1.48% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcMonotonicQGradientsForElems | | |-> 0.12 - 1.44% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.12 - 1.44% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.12 - 1.44% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.11 - 1.31% [23] {min=0.00, max=0.03, mean=0.00, var=0.00, std dev=0.01} Kokkos deep copy: Host _mirror -> Cuda | | |-> 0.11 - 1.29% [23] {min=0.00, max=0.03, mean=0.00, var=0.00, std dev=0.01} cudaMemcpy | | | |-> 0.11 - 1.27% [23] {min=0.00, max=0.03, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.01% [46] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [46] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.01% | |-> 0.11 - 1.31% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_reduce [Cuda, Dev:0] 18__nv_hdl_wrapper_tILb0ELb0E11__nv_dl_tagIPFvR6DomainiidRdEXadL_ZN72_INTERNAL_50__tmp_tmpxft_0025c612_00000000_6_lulesh_cudafe1_cpp_c8b50e3d29CalcCourantConstraintForElemsES2_iidS3_EELj1EEFviR9MinFinderEJS1_idEE | | |-> 0.08 - 0.93% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaStreamSynchronize | | | |-> 0.08 - 0.91% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Stream Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.06% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.08 - 0.96% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::InvalidType, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.05% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.03% | | |-> 0.00 - 0.03% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.01% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.01% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.02 - 0.20% | |-> 0.10 - 1.22% [12] {min=0.00, max=0.03, mean=0.01, var=0.00, std dev=0.01} Kokkos deep copy: Cuda -> Host _mirror | | |-> 0.10 - 1.22% [12] {min=0.00, max=0.03, mean=0.01, var=0.00, std dev=0.01} cudaMemcpy | | | |-> 0.10 - 1.20% [12] {min=0.00, max=0.03, mean=0.01, var=0.00, std dev=0.01} GPU: Memcpy DtoH | | | Remainder: 0.00 - 0.01% | | |-> 0.00 - 0.00% [24] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [24] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.09 - 1.06% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_reduce [Cuda, Dev:0] 18__nv_hdl_wrapper_tILb0ELb0E11__nv_dl_tagIPFvR6DomainiidRdEXadL_ZN72_INTERNAL_50__tmp_tmpxft_0025c612_00000000_6_lulesh_cudafe1_cpp_c8b50e3d27CalcHydroConstraintForElemsES2_iidS3_EELj1EEFviR9MinFinderEJS1_idEE | | |-> 0.06 - 0.69% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaStreamSynchronize | | | |-> 0.06 - 0.67% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Stream Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.06% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.06 - 0.71% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::InvalidType, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.05% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.03% | | |-> 0.00 - 0.03% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.01% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.01% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.02 - 0.19% | |-> 0.08 - 0.97% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] EvalEOSForElems F | | |-> 0.05 - 0.65% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.05 - 0.63% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.06% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.06 - 0.67% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.05% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.03% | | |-> 0.00 - 0.03% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.01% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.01 - 0.16% | |-> 0.08 - 0.96% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcSoundSpeedForElems | | |-> 0.05 - 0.62% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.05 - 0.60% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.00 - 0.06% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.06 - 0.65% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.05% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.02% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.03% | | |-> 0.00 - 0.04% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.01% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.03% | | |-> 0.00 - 0.03% [704] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.01 - 0.17% | |-> 0.07 - 0.82% [613] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [] via memset | | |-> 0.06 - 0.68% [613] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.06 - 0.66% [613] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.02% | | |-> 0.01 - 0.08% [613] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.06 - 0.68% [613] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | Remainder: 0.01 - 0.06% | |-> 0.06 - 0.75% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_reduce [Cuda, Dev:0] 18__nv_hdl_wrapper_tILb0ELb0E11__nv_dl_tagIPFvR6DomainEXadL_ZN72_INTERNAL_50__tmp_tmpxft_0025c612_00000000_6_lulesh_cudafe1_cpp_c8b50e3d20CalcLagrangeElementsES2_EELj1EEFviRiEJS1_EE | | |-> 0.06 - 0.72% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaStreamSynchronize | | | |-> 0.06 - 0.71% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Stream Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.06 - 0.72% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::InvalidType, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.06 - 0.73% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcPositionForNodes | | |-> 0.06 - 0.69% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.06 - 0.69% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.06 - 0.70% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.06 - 0.73% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcVelocityForNodes | | |-> 0.06 - 0.69% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.06 - 0.69% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.06 - 0.70% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.05 - 0.58% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcAccelerationForNodes | | |-> 0.05 - 0.54% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.05 - 0.54% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.05 - 0.54% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.04 - 0.52% [1] {min=0.04, max=0.04, mean=0.04, var=0.00, std dev=0.00} Kokkos::parallel_for [OpenMP] Kokkos::View::initialization [nodeElemCornerList_mirror] via memset | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.04 - 0.52% | |-> 0.04 - 0.42% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] InitStressTermsForElems | | |-> 0.03 - 0.39% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.03 - 0.39% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.03 - 0.39% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.03 - 0.32% [1] {min=0.03, max=0.03, mean=0.03, var=0.00, std dev=0.00} Kokkos deep copy: Host nodeElemCornerList_mirror -> Cuda nodeElemCornerList | | |-> 0.03 - 0.31% [1] {min=0.03, max=0.03, mean=0.03, var=0.00, std dev=0.00} cudaMemcpy | | | |-> 0.03 - 0.31% [1] {min=0.03, max=0.03, mean=0.03, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.02 - 0.23% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] CalcForceForNodes | | |-> 0.02 - 0.19% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.02 - 0.19% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.02 - 0.20% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [3] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbol | | | |-> 0.00 - 0.00% [3] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.02 - 0.19% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_reduce [Cuda, Dev:0] 18__nv_hdl_wrapper_tILb0ELb0E11__nv_dl_tagIPFvR6DomainEXadL_ZN72_INTERNAL_50__tmp_tmpxft_0025c612_00000000_6_lulesh_cudafe1_cpp_c8b50e3d13CalcQForElemsES2_EELj1EEFvRKiRiEJS1_EE | | |-> 0.01 - 0.14% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaStreamSynchronize | | | |-> 0.01 - 0.14% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Stream Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.01 - 0.15% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::InvalidType, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.02 - 0.18% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] ApplyMaterialPropertiesForElems A | | |-> 0.01 - 0.15% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.01 - 0.15% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.01 - 0.15% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.02 - 0.18% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] UpdateVolumesForElems | | |-> 0.01 - 0.15% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.01 - 0.14% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.01 - 0.15% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.02 - 0.18% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_reduce [Cuda, Dev:0] 18__nv_hdl_wrapper_tILb0ELb0E11__nv_dl_tagIPFvR6DomainEXadL_ZN72_INTERNAL_50__tmp_tmpxft_0025c612_00000000_6_lulesh_cudafe1_cpp_c8b50e3d31ApplyMaterialPropertiesForElemsES2_EELj4EEFviRiEJS1_ddEE | | |-> 0.01 - 0.14% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaStreamSynchronize | | | |-> 0.01 - 0.14% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Stream Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.01 - 0.15% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::InvalidType, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.01 - 0.16% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_reduce [Cuda, Dev:0] 18__nv_hdl_wrapper_tILb0ELb0E11__nv_dl_tagIPFvR6DomainEXadL_ZN72_INTERNAL_50__tmp_tmpxft_0025c612_00000000_6_lulesh_cudafe1_cpp_c8b50e3d23CalcVolumeForceForElemsES2_EELj1EEFviRiEJN6Kokkos4ViewIPdJEEEEE | | |-> 0.01 - 0.13% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaStreamSynchronize | | | |-> 0.01 - 0.13% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Stream Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.01 - 0.13% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_local_memory, Kokkos::InvalidType, Kokkos::RangePolicy> >(Kokkos::Impl::ParallelReduce<_INTERNAL_019bd3e3_9_lulesh_cc_c8b50e3d::CalcVolumeForceForElems(Domain&)::{lambda(int, int&)#1}, Kokkos::RangePolicy, Kokkos::InvalidType, Kokkos::RangePolicy>) | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.01 - 0.13% [2670] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaStreamSynchronize | | |-> 0.01 - 0.07% [2670] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Stream Synchronize | | Remainder: 0.01 - 0.06% | |-> 0.01 - 0.11% [942] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyAsync | | |-> 0.00 - 0.02% [942] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | Remainder: 0.01 - 0.08% | |-> 0.01 - 0.10% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] ApplyMaterialPropertiesForElems B | | |-> 0.01 - 0.09% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.01 - 0.09% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.01 - 0.09% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_local_memory, Kokkos::RangePolicy> >(Kokkos::Impl::ParallelFor<_INTERNAL_019bd3e3_9_lulesh_cc_c8b50e3d::ApplyMaterialPropertiesForElems(Domain&)::{lambda(int)#2}, Kokkos::RangePolicy, Kokkos::RangePolicy>) | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.01% | |-> 0.01 - 0.10% [2] {min=0.00, max=0.01, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [Buffer] via memset | | |-> 0.01 - 0.10% [2] {min=0.00, max=0.01, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.01 - 0.10% [2] {min=0.00, max=0.01, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.01 - 0.10% [2] {min=0.00, max=0.01, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | Remainder: 0.00 - 0.00% | |-> 0.01 - 0.09% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] ApplyMaterialPropertiesForElems C | | |-> 0.01 - 0.08% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.01 - 0.08% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.01 - 0.08% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_local_memory, Kokkos::RangePolicy> >(Kokkos::Impl::ParallelFor<_INTERNAL_019bd3e3_9_lulesh_cc_c8b50e3d::ApplyMaterialPropertiesForElems(Domain&)::{lambda(int)#3}, Kokkos::RangePolicy, Kokkos::RangePolicy>) | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.01% | |-> 0.01 - 0.09% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [vnewc] via memset | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | Remainder: 0.00 - 0.01% | |-> 0.01 - 0.08% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [sigxx] via memset | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | Remainder: 0.00 - 0.01% | |-> 0.01 - 0.08% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [sigyy] via memset | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | Remainder: 0.00 - 0.01% | |-> 0.01 - 0.08% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [sigzz] via memset | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | Remainder: 0.00 - 0.01% | |-> 0.01 - 0.08% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [determ] via memset | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.01 - 0.07% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | Remainder: 0.00 - 0.01% | |-> 0.01 - 0.08% [1] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} Kokkos::parallel_for [OpenMP] Kokkos::View::initialization [m_nodeElemStart_mirror] via memset | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.01 - 0.08% | |-> 0.01 - 0.07% [1] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} Kokkos::parallel_for [OpenMP] Kokkos::View::initialization [nodeElemCount] via memset | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.01 - 0.07% | |-> 0.01 - 0.06% [1] {min=0.01, max=0.01, mean=0.01, var=0.00, std dev=0.00} Kokkos::parallel_for [OpenMP] Kokkos::View::initialization [regElemlist::entries_mirror] via memset | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.01 - 0.06% | |-> 0.00 - 0.05% [612] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::ViewCopy-1D | | |-> 0.00 - 0.02% [612] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.01% [612] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.01% | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | Remainder: 0.00 - 0.03% | |-> 0.00 - 0.05% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos deep copy: Host regElemlist::entries_mirror -> Cuda regElemlist::entries | | |-> 0.00 - 0.05% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpy | | | |-> 0.00 - 0.04% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.04% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos deep copy: Host m_nodeElemStart_mirror -> Cuda m_nodeElemStart | | |-> 0.00 - 0.04% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpy | | | |-> 0.00 - 0.04% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.04% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] ApplyAccelerationBoundaryConditionsForNodes A | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.00 - 0.03% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] ApplyAccelerationBoundaryConditionsForNodes B | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.00 - 0.03% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] ApplyAccelerationBoundaryConditionsForNodes C | | |-> 0.00 - 0.01% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_constant_memory, Kokkos::RangePolicy> >() | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbolAsync | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventRecord | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventSynchronize | | | |-> 0.00 - 0.00% [64] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Event Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | Remainder: 0.00 - 0.02% | |-> 0.00 - 0.02% [13] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos deep copy: Host Scalar -> Cuda | | |-> 0.00 - 0.02% [13] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::ViewFill-1D | | | |-> 0.00 - 0.01% [13] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | | |-> 0.00 - 0.01% [13] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | | Remainder: 0.00 - 0.00% | | | |-> 0.00 - 0.00% [13] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | | | |-> 0.00 - 0.01% [12] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_local_memory, Kokkos::MemoryTraits<0u> >, Kokkos::LayoutRight, Kokkos::Cuda, 1, int>, Kokkos::RangePolicy >, Kokkos::Cuda> >(Kokkos::Impl::ParallelFor, Kokkos::MemoryTraits<0u> >, Kokkos::LayoutRight, Kokkos::Cuda, 1, int>, Kokkos::RangePolicy >, Kokkos::Cuda>) | | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: void Kokkos::Impl::cuda_parallel_launch_local_memory, Kokkos::MemoryTraits<0u> >, Kokkos::LayoutRight, Kokkos::Cuda, 1, int>, Kokkos::RangePolicy >, Kokkos::Cuda> >(Kokkos::Impl::ParallelFor, Kokkos::MemoryTraits<0u> >, Kokkos::LayoutRight, Kokkos::Cuda, 1, int>, Kokkos::RangePolicy >, Kokkos::Cuda>) | | | |-> 0.00 - 0.00% [3] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbol | | | | |-> 0.00 - 0.00% [3] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | | Remainder: 0.00 - 0.00% | | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncGetAttributes | | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaFuncSetCacheConfig | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [39] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [39] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.02% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaHostAlloc | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [nodeElemCornerList] via memset | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [m_nodeElemStart] via memset | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [regElemlist::entries] via memset | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaGetDeviceProperties | |-> 0.00 - 0.00% [3] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaLaunchKernel | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: desul::(anonymous namespace)::init_lock_arrays_cuda_kernel() | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Kokkos::(anonymous namespace)::init_lock_array_kernel_atomic() | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Kokkos::Impl::(anonymous namespace)::query_cuda_kernel_arch(int*) | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaGetDeviceCount | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos deep copy: Host regElemlist::row_map_mirror -> Cuda regElemlist::row_map | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpy | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpy | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy DtoH | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMallocHost | |-> 0.00 - 0.00% [5] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemcpyToSymbol | | |-> 0.00 - 0.00% [5] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memcpy HtoD | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::View::initialization [regElemlist::row_map] via memset | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemsetAsync | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | | Remainder: 0.00 - 0.00% | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaMemset | | |-> 0.00 - 0.00% [2] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Memset | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaSetDevice | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaEventCreate | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [OpenMP] Kokkos::View::initialization [regBinEnd] via memset | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [Cuda, Dev:0] Kokkos::ViewCopy-2D | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} Kokkos::parallel_for [OpenMP] Kokkos::View::initialization [regElemlist::row_map_mirror] via memset | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSynchronize | | | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} GPU: Context Synchronize | | | Remainder: 0.00 - 0.00% | | Remainder: 0.00 - 0.00% | |-> 0.00 - 0.00% [1] {min=0.00, max=0.00, mean=0.00, var=0.00, std dev=0.00} cudaDeviceSetCacheConfig | Remainder: 1.63 - 19.29%