This example demonstrates best practices for optimizing application performance with oneDNN. It implements a convolution followed by ReLU in three ways: a naive version on plain nchw data, a version that lets oneDNN propagate optimized (blocked) memory formats, and a version that additionally fuses the ReLU into the convolution as a post-op.
#include <iostream>
#include <stdexcept>
#include <vector>
#include "example_utils.hpp"
// Fill a oneDNN memory object with a constant value.
void init_data(memory &m, float v) {
    size_t size = m.get_desc().get_size() / sizeof(float);
    std::vector<float> data(size, v);
    write_to_dnnl_memory(data.data(), m);
}
// Create and execute an in-place ReLU primitive on `data`.
void create_and_execute_relu(memory &data, engine &eng, stream &s) {
    auto relu_d = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu, data.get_desc(), 0.f, 0.f);
    auto relu_pd = eltwise_forward::primitive_desc(relu_d, eng);
    auto relu = eltwise_forward(relu_pd);
    relu.execute(s, {{DNNL_ARG_SRC, data}, {DNNL_ARG_DST, data}});
}
// Create a primitive attribute holding a single ReLU post-op.
primitive_attr create_attr_with_relu_post_op() {
    post_ops ops;
    ops.append_eltwise(1.f, algorithm::eltwise_relu, 0.f, 0.f);
    primitive_attr attr;
    attr.set_post_ops(ops);
    return attr;
}
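// Fusion happens when this attribute is passed to
// convolution_forward::primitive_desc (see conv_relu_fused below): the
// selected convolution kernel applies ReLU to each output value before it
// is written to memory.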
// Implementation 1, "naive": convolution on the user's plain nchw/oihw
// data, followed by a separate, non-fused ReLU.
void conv_relu_naive(const memory &user_src, const memory &user_wei,
        memory user_dst, engine &eng, stream &s) {
    // Copy dimensions and formats directly from the user's memory.
    auto conv_src_md = memory::desc(user_src.get_desc());
    auto conv_wei_md = memory::desc(user_wei.get_desc());
    auto conv_dst_md = memory::desc(user_dst.get_desc());

    // Create a convolution primitive descriptor and primitive.
    auto conv_d = convolution_forward::desc(prop_kind::forward_inference,
            algorithm::convolution_direct, conv_src_md, conv_wei_md,
            conv_dst_md, strides, padding, padding);
    auto conv_pd = convolution_forward::primitive_desc(conv_d, eng);
    auto conv = convolution_forward(conv_pd);

    // Execute the convolution, then ReLU as a second primitive.
    conv.execute(s, {{DNNL_ARG_SRC, user_src}, {DNNL_ARG_WEIGHTS, user_wei},
            {DNNL_ARG_DST, user_dst}});
    create_and_execute_relu(user_dst, eng, s);
    s.wait();
}
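// Performance note: with fixed plain layouts the library can only use
// implementations that handle nchw/oihw directly, which on most CPUs are
// much slower than the blocked-layout kernels, and the standalone ReLU
// costs an extra full read/write pass over the destination tensor.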
// Implementation 2, "blocked": let oneDNN choose optimized (blocked) memory
// formats for the convolution, reordering user data in and out as needed.
void conv_relu_blocked(memory user_src, memory user_wei, memory user_dst,
        engine &eng, stream &s) {
    // Copy dimensions and data type from the user's memory, but set the
    // format tag to "any" so the convolution can pick the best layout.
    auto conv_src_md = memory::desc(user_src.get_desc().dims(),
            memory::data_type::f32, memory::format_tag::any);
    auto conv_wei_md = memory::desc(user_wei.get_desc().dims(),
            memory::data_type::f32, memory::format_tag::any);
    auto conv_dst_md = memory::desc(user_dst.get_desc().dims(),
            memory::data_type::f32, memory::format_tag::any);

    auto conv_d = convolution_forward::desc(prop_kind::forward_inference,
            algorithm::convolution_direct, conv_src_md, conv_wei_md,
            conv_dst_md, strides, padding, padding);
    auto conv_pd = convolution_forward::primitive_desc(conv_d, eng);

    // Reorder the source into the convolution's preferred format if needed.
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }

    // Likewise for the weights.
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }

    // Allocate a destination in the convolution's preferred format if needed.
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);

    auto conv = convolution_forward(conv_pd);
    conv.execute(s, {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
            {DNNL_ARG_DST, conv_dst}});

    // ReLU runs on the convolution's (possibly blocked) destination format.
    create_and_execute_relu(conv_dst, eng, s);

    // Reorder the result back to the user's format if the two differ.
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
}
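// Performance note: format_tag::any lets the convolution pick an optimized
// blocked layout (for example, nChw16c on AVX-512 CPUs). The reorders take
// time, but in a real network they are typically amortized by keeping data
// blocked across many primitives and reordering only at the model's edges.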
// Implementation 3, "fused": same format propagation as above, with ReLU
// fused into the convolution via a post-op attribute.
void conv_relu_fused(memory user_src, memory user_wei, memory user_dst,
        const engine &eng, stream &s) {
    // Use format_tag::any so the convolution picks the best layout.
    auto conv_src_md = memory::desc(user_src.get_desc().dims(),
            memory::data_type::f32, memory::format_tag::any);
    auto conv_wei_md = memory::desc(user_wei.get_desc().dims(),
            memory::data_type::f32, memory::format_tag::any);
    auto conv_dst_md = memory::desc(user_dst.get_desc().dims(),
            memory::data_type::f32, memory::format_tag::any);

    auto conv_d = convolution_forward::desc(prop_kind::forward_inference,
            algorithm::convolution_direct, conv_src_md, conv_wei_md,
            conv_dst_md, strides, padding, padding);

    // Attach the fused-ReLU attribute to the primitive descriptor.
    auto attr = create_attr_with_relu_post_op();
    auto conv_pd = convolution_forward::primitive_desc(conv_d, attr, eng);

    // Reorder source and weights into the convolution's preferred formats
    // if they differ from the user's.
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);

    // Execute the convolution; ReLU is applied inside the same kernel,
    // so no separate eltwise primitive is needed.
    auto conv = convolution_forward(conv_pd);
    conv.execute(s, {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
            {DNNL_ARG_DST, conv_dst}});

    // Reorder the result back to the user's format if needed.
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
}
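// Relative to conv_relu_blocked, the only differences are the attribute
// passed when creating the primitive descriptor and the removed
// create_and_execute_relu() call: the post-op ReLU runs inside the
// convolution kernel, eliminating one full pass over the destination.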
void performance_profiling(
        engine::kind engine_kind, int argc, char **argv) {
    // Initialize an engine on the requested device kind, and a stream on it.
    engine eng(engine_kind, 0);
    stream s(eng);

    // Dimensions of the synthetic data and weights (AlexNet's first conv).
    const memory::dim BATCH = 128;
    const memory::dim IC = 3, OC = 96;
    const memory::dim IH = 227, KH = 11, OH = 55;
    const memory::dim IW = 227, KW = 11, OW = 55;

    // User tensors in plain formats: nchw activations, oihw weights.
    auto user_src = memory({{BATCH, IC, IH, IW}, memory::data_type::f32,
            memory::format_tag::nchw}, eng);
    auto user_wei = memory({{OC, IC, KH, KW}, memory::data_type::f32,
            memory::format_tag::oihw}, eng);
    auto user_dst = memory({{BATCH, OC, OH, OW}, memory::data_type::f32,
            memory::format_tag::nchw}, eng);

    // Fill source, destination, and weights with synthetic values.
    init_data(user_src, 1);
    init_data(user_dst, -1);
    init_data(user_wei, .5);
    std::string implementation;
    if (argc <= 2)
        implementation = "validation";
    else if (argc == 3)
        implementation = argv[2];

    if (!(implementation == "validation" || implementation == "naive"
                || implementation == "blocked" || implementation == "fused")) {
        std::cout << "The implementation can be one of:\n";
        std::cout << " - naive: NCHW format without fusion\n";
        std::cout << " - blocked: format propagation without fusion\n";
        std::cout << " - fused: format propagation with fusion\n";
        std::cout << " - validation: runs all implementations\n\n";
        std::cout << "Validation will run if no parameters are specified.\n\n";
        throw std::invalid_argument("Incorrect input arguments.");
    }

    if (implementation == "naive" || implementation == "validation") {
        std::cout << "Implementation: naive.\n";
        conv_relu_naive(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ nchw format completed.\n";
    }

    if (implementation == "blocked" || implementation == "validation") {
        std::cout << "Implementation: blocked.\n";
        conv_relu_blocked(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ blocked format completed.\n";
    }

    if (implementation == "fused" || implementation == "validation") {
        std::cout << "Implementation: fused.\n";
        conv_relu_fused(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ fusing completed.\n";
    }
}
int main(int argc, char **argv) {
    engine::kind engine_kind = parse_engine_kind(argc, argv, 1);
    return handle_example_errors(
            performance_profiling, engine_kind, argc, argv);
}
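The program prints only completion messages. To compare the three implementations, run with the ONEDNN_VERBOSE=1 environment variable set so the library prints the implementation and execution time of every primitive it executes, or time the calls yourself. The sketch below is a minimal wall-clock alternative; the run_timed helper is illustrative and not part of the example.

#include <chrono>

// Run a callable once and return the elapsed time in milliseconds.
template <typename F>
double run_timed(F &&f) {
    auto t0 = std::chrono::steady_clock::now();
    f();
    auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(t1 - t0).count();
}

// Example use inside performance_profiling, once the memory objects exist:
//     double ms = run_timed([&] {
//         conv_relu_fused(user_src, user_wei, user_dst, eng, s);
//     });
//     std::cout << "fused: " << ms << " ms\n";

Note that a single timed run also includes primitive creation (kernel generation) time; averaging several warm runs gives a fairer comparison.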