commit
a70db4da89
@ -0,0 +1,3 @@
|
||||
Changelog (v0.7.2.0)
|
||||
--------------------
|
||||
- AI Green Screen - Latency improvements
|
@ -0,0 +1,97 @@
|
||||
cmake_minimum_required(VERSION 3.10)

# Set path where samples will be installed
set(CMAKE_INSTALL_PREFIX ${CMAKE_SOURCE_DIR} CACHE PATH "Path to where the samples will be installed")
option(INSTALL_SDK "Install binaries into the samples folder" OFF)

project(NvVideoEffects_SDK CXX)

set(CMAKE_CONFIGURATION_TYPES "Release")

# Require C++11 and disable non-standard extensions
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Directory-scoped on purpose: the samples subdirectory added below must
# inherit these Windows-header hygiene defines as well.
add_definitions(-DNOMINMAX -DWIN32_LEAN_AND_MEAN)

# Set common build path for all targets
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})

if(MSVC)

  # On Windows the headers ship inside this repository; the DLLs are provided
  # by the separately-installed SDK redistributable at run time.
  set(SDK_INCLUDES_PATH ${CMAKE_CURRENT_SOURCE_DIR}/nvvfx/include)

  # Add target for NVVideoEffects
  add_library(NVVideoEffects INTERFACE)
  target_include_directories(NVVideoEffects INTERFACE ${SDK_INCLUDES_PATH})
  # NOTE(review): no NVCVImage target is created in this branch; presumably the
  # Windows samples resolve nvCVImage from SDK_INCLUDES_PATH directly — confirm.

else()

  # Add target for NVVideoEffects
  add_library(NVVideoEffects INTERFACE)

  # Headers are found in different locations depending on the type of package.
  #
  # BUG FIX: the original used `find_path(... REQUIRED)`, but the REQUIRED
  # keyword was only added in CMake 3.18 while this project declares a 3.10
  # minimum. On older CMake the word "REQUIRED" is silently treated as an
  # extra search path and a failed search goes unnoticed until link time.
  # An explicit NOTFOUND check below works on every supported CMake version
  # and produces the same fatal error on 3.18+.
  find_path(VideoFX_INCLUDES
    NAMES nvVideoEffects.h
    PATHS
      /usr/local/VideoFX/include
      /usr/include/x86_64-linux-gnu
      /usr/include
  )
  if(NOT VideoFX_INCLUDES)
    message(FATAL_ERROR "nvVideoEffects.h not found. Install the VideoFX SDK package first.")
  endif()

  target_include_directories(NVVideoEffects INTERFACE ${VideoFX_INCLUDES})
  set(SDK_INCLUDES_PATH ${VideoFX_INCLUDES})

  # NO_DEFAULT_PATH: only accept the SDK's own library, never a stray
  # system-wide libVideoFX.so picked up from default linker paths.
  find_library(VideoFX_LIB
    NAMES libVideoFX.so
    PATHS
      /usr/local/VideoFX/lib
      /usr/lib/x86_64-linux-gnu
      /usr/lib64
      /usr/lib
    NO_DEFAULT_PATH)
  if(NOT VideoFX_LIB)
    message(FATAL_ERROR "libVideoFX.so not found. Install the VideoFX SDK package first.")
  endif()

  target_link_libraries(NVVideoEffects INTERFACE "${VideoFX_LIB}")

  message(STATUS "VideoFX_LIB: ${VideoFX_LIB}")
  message(STATUS "SDK_INCLUDES_PATH: ${SDK_INCLUDES_PATH}")

  # Add target for NVCVImage
  add_library(NVCVImage INTERFACE)

  # Headers are found in different locations depending on the type of package
  # (same version-compatibility note as for VideoFX_INCLUDES above).
  find_path(NVCVImage_INCLUDES
    NAMES nvCVImage.h
    PATHS
      /usr/local/VideoFX/include
      /usr/include/x86_64-linux-gnu
      /usr/include
  )
  if(NOT NVCVImage_INCLUDES)
    message(FATAL_ERROR "nvCVImage.h not found. Install the VideoFX SDK package first.")
  endif()

  target_include_directories(NVCVImage INTERFACE ${NVCVImage_INCLUDES})

  find_library(NVCVImage_LIB
    NAMES libNVCVImage.so
    PATHS
      /usr/local/VideoFX/lib
      /usr/lib/x86_64-linux-gnu
      /usr/lib64
      /usr/lib
    NO_DEFAULT_PATH)
  if(NOT NVCVImage_LIB)
    message(FATAL_ERROR "libNVCVImage.so not found. Install the VideoFX SDK package first.")
  endif()

  target_link_libraries(NVCVImage INTERFACE "${NVCVImage_LIB}")

  message(STATUS "NVCVImage_LIB: ${NVCVImage_LIB}")
  message(STATUS "NVCVImage_INCLUDES_PATH: ${NVCVImage_INCLUDES}")

endif()

add_subdirectory(samples)
|
@ -0,0 +1,20 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2021 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@ -0,0 +1,87 @@
|
||||
# README
|
||||
## NVIDIA MAXINE VideoEffects SDK: API Source Code and Sample Applications
|
||||
|
||||
NVIDIA MAXINE Video Effects SDK enables AI-based visual effects that run with standard webcam input and can easily be integrated into video conference and content creation pipelines. The underlying deep learning models are optimized with NVIDIA AI using NVIDIA® TensorRT™ for high-performance inference, making it possible for developers to apply multiple effects in real-time applications.
|
||||
|
||||
The SDK has the following AI features:
|
||||
|
||||
- **Virtual Background**, which segments and masks the background areas in a video or image to enable AI-powered background removal, replacement, or blur.
|
||||
- **Artifact Reduction**, which reduces compression artifacts from an encoded video while preserving the details of the original video.
|
||||
- **Super Resolution**, which generates a detail-enhanced video with up to 4X high-quality scaling, while also reducing blocky/noisy artifacts and preserving textures and content. It is suitable for upscaling lossy content.
|
||||
- **Upscaler**, which is a very fast and light-weight method to deliver up to 4X high-quality scaled video with an adjustable sharpening parameter. This feature can be optionally pipelined with the Artifact Reduction feature to enhance the scale while reducing the video artifacts.
|
||||
- **Video Noise Removal**, which removes low-light camera noise from a webcam video while preserving the texture details.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://github.com/NVIDIA/MAXINE-VFX-SDK/blob/master/resources/SR.gif" alt="NVIDIA Super Resolution" width="640" height="320"/>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<img src="https://github.com/NVIDIA/MAXINE-VFX-SDK/blob/master/resources/Denoise.gif" alt="NVIDIA Video Noise Removal" width="640" height="320"/>
|
||||
</p>
|
||||
|
||||
The SDK provides several sample applications that demonstrate the features listed above in real time by using offline videos.
|
||||
- **AI Green Screen App**, which is a sample app that demonstrates the Virtual background feature.
|
||||
- **VideoEffects App**, which is a sample app that can invoke each of Artifact Reduction, Super Resolution or Upscaler features individually.
|
||||
- **UpscalePipeline App**, which is a sample app that pipelines the Artifact Reduction feature with the Upscaler feature.
|
||||
- **DenoiseEffect App**, which is a sample app that demonstrates the Video Noise Removal feature.
|
||||
|
||||
The input and output resolutions supported by the features of the SDK are listed below.
|
||||
- The Artifact Reduction feature supports between 90p to 1080p as input resolutions.
|
||||
- The Super Resolution feature supports between 90p to 2160p as input resolutions.
|
||||
- Super Resolution supports the following scaling factors: 4/3x (~1.33x), 1.5x, 2x, 3x and 4x.
|
||||
- 2160p input is only supported for the following scaling factors: 4/3x (~1.33x), 1.5x and 2x
|
||||
- The maximum output resolution for the Super Resolution feature is 4320p.
|
||||
- The Upscaler feature supports any input resolution, and the following scaling factors: 4/3x (~1.33x), 1.5x, 2x, 3x and 4x.
|
||||
- The Video Noise Removal feature supports between 80p to 1080p as input resolutions.
|
||||
- The Virtual Background and Background Blur features require that an input image/video be at least 288 pixels high.
|
||||
|
||||
NVIDIA MAXINE VideoEffects SDK is distributed in the following parts:
|
||||
|
||||
- This open source repository that includes the [SDK API and proxy linking source code](https://github.com/NVIDIA/MAXINE-VFX-SDK/tree/master/nvvfx), and [sample applications and their dependency libraries](https://github.com/NVIDIA/MAXINE-VFX-SDK/tree/master/samples).
|
||||
- An installer hosted on [NVIDIA Maxine End-user Redistributables page](https://www.nvidia.com/broadcast-sdk-resources) that installs the SDK DLLs, the models, and the SDK dependency libraries.
|
||||
|
||||
Please refer to the [SDK System guide](https://docs.nvidia.com/deeplearning/maxine/vfx-sdk-system-guide/) for configuring and integrating the SDK, compiling and running the sample applications. Please visit the [NVIDIA MAXINE Video Effects SDK](https://developer.nvidia.com/maxine-getting-started) webpage for more information about the SDK.
|
||||
|
||||
## System requirements
|
||||
The SDK is supported on NVIDIA GPUs that are based on the NVIDIA® Turing™, Ampere™ or Ada™ architecture and have Tensor Cores.
|
||||
|
||||
* Windows OS supported: 64-bit Windows 10 or later
|
||||
* Microsoft Visual Studio: 2017 (MSVC15.0) or later
|
||||
* CMake: v3.12 or later
|
||||
* NVIDIA Graphics Driver for Windows: 511.65 or later
|
||||
|
||||
## NVIDIA MAXINE Branding Guidelines
|
||||
If you integrate an NVIDIA MAXINE SDK within your product, please follow the required branding guidelines that are available [here](https://www.nvidia.com/maxine-sdk-guidelines/).
|
||||
|
||||
## Compiling the sample apps
|
||||
|
||||
### Steps
|
||||
|
||||
The open source repository includes the source code to build the sample applications, and a proxy file NVVideoEffectsProxy.cpp to enable compilation without explicitly linking against the SDK DLL.
|
||||
|
||||
**Note: To download the models and runtime dependencies required by the features, you need to run the [SDK Installer](https://www.nvidia.com/broadcast-sdk-resources).**
|
||||
|
||||
1. In the root folder of the downloaded source code, start the CMake GUI and specify the source folder and a build folder for the binary files.
|
||||
* For the source folder, ensure that the path ends in OSS.
|
||||
* For the build folder, ensure that the path ends in OSS/build.
|
||||
2. Use CMake to configure and generate the Visual Studio solution file.
|
||||
* Click Configure.
|
||||
* When prompted to confirm that CMake can create the build folder, click OK.
|
||||
* Select Visual Studio for the generator and x64 for the platform.
|
||||
* To complete configuring the Visual Studio solution file, click Finish.
|
||||
* To generate the Visual Studio Solution file, click Generate.
|
||||
* Verify that the build folder contains the NvVideoEffects_SDK.sln file.
|
||||
3. Use Visual Studio to generate the application binary .exe file from the NvVideoEffects_SDK.sln file.
|
||||
* In CMake, to open Visual Studio, click Open Project.
|
||||
* In Visual Studio, select Build > Build Solution.
|
||||
|
||||
## Documentation
|
||||
Please refer to the online documentation guides -
|
||||
* [NVIDIA Video Effects SDK Programming Guide](https://docs.nvidia.com/deeplearning/maxine/vfx-sdk-programming-guide/index.html)
|
||||
* [NVIDIA Video Effects SDK System Guide](https://docs.nvidia.com/deeplearning/maxine/vfx-sdk-system-guide/index.html)
|
||||
* [NvCVImage API Guide](https://docs.nvidia.com/deeplearning/maxine/nvcvimage-api-guide/index.html)
|
||||
|
||||
PDF versions of these guides are also available at the following locations -
|
||||
* [NVIDIA Video Effects SDK Programming Guide](https://docs.nvidia.com/deeplearning/maxine/pdf/vfx-sdk-programming-guide.pdf)
|
||||
* [NVIDIA Video Effects SDK System Guide](https://docs.nvidia.com/deeplearning/maxine/pdf/vfx-sdk-system-guide.pdf)
|
||||
* [NvCVImage API Guide](https://docs.nvidia.com/deeplearning/maxine/pdf/nvcvimage-api-guide.pdf)
|
@ -0,0 +1,25 @@
|
||||
The contents of this folder are governed by the MIT license
|
||||
|
||||
Copyright (C) 2019, NVIDIA Corporation, all rights reserved.
|
||||
|
||||
MIT License
|
||||
|
||||
Permission is hereby granted, free of charge, to any person
|
||||
obtaining a copy of this software and associated documentation
|
||||
files (the "Software"), to deal in the Software without
|
||||
restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
OTHER DEALINGS IN THE SOFTWARE.
|
@ -0,0 +1,756 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright 2020-2021 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
|
||||
#ifndef __NVCVIMAGE_H__
#define __NVCVIMAGE_H__

#include "nvCVStatus.h"

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus


#ifndef RTX_CAMERA_IMAGE // Compile with -DRTX_CAMERA_IMAGE=0 to get more functionality and bug fixes.
#define RTX_CAMERA_IMAGE 0 // Set to 1 for RTXCamera, which needs an old version, that avoids new functionality
#endif // RTX_CAMERA_IMAGE


// Forward declaration so the CUDA headers are not required here.
struct CUstream_st; // typedef struct CUstream_st *CUstream;
|
||||
|
||||
//! The format of pixels in an image.
//! Note: the component order in each comment is the in-memory chunky order.
typedef enum NvCVImage_PixelFormat {
  NVCV_FORMAT_UNKNOWN = 0,  //!< Unknown pixel format.
  NVCV_Y              = 1,  //!< Luminance (gray).
  NVCV_A              = 2,  //!< Alpha (opacity)
  NVCV_YA             = 3,  //!< { Luminance, Alpha }
  NVCV_RGB            = 4,  //!< { Red, Green, Blue }
  NVCV_BGR            = 5,  //!< { Blue, Green, Red }
  NVCV_RGBA           = 6,  //!< { Red, Green, Blue, Alpha }
  NVCV_BGRA           = 7,  //!< { Blue, Green, Red, Alpha }
#if RTX_CAMERA_IMAGE
  NVCV_YUV420         = 8,  //!< Luminance and subsampled Chrominance { Y, Cb, Cr }
  NVCV_YUV422         = 9,  //!< Luminance and subsampled Chrominance { Y, Cb, Cr }
#else // !RTX_CAMERA_IMAGE
  NVCV_ARGB           = 8,  //!< { Alpha, Red, Green, Blue }
  NVCV_ABGR           = 9,  //!< { Alpha, Blue, Green, Red }
  NVCV_YUV420         = 10, //!< Luminance and subsampled Chrominance { Y, Cb, Cr }
  NVCV_YUV422         = 11, //!< Luminance and subsampled Chrominance { Y, Cb, Cr }
#endif // !RTX_CAMERA_IMAGE
  NVCV_YUV444         = 12, //!< Luminance and full bandwidth Chrominance { Y, Cb, Cr }
} NvCVImage_PixelFormat;
|
||||
|
||||
|
||||
//! The data type used to represent each component of an image.
typedef enum NvCVImage_ComponentType {
  NVCV_TYPE_UNKNOWN = 0,  //!< Unknown type of component.
  NVCV_U8           = 1,  //!< Unsigned 8-bit integer.
  NVCV_U16          = 2,  //!< Unsigned 16-bit integer.
  NVCV_S16          = 3,  //!< Signed 16-bit integer.
  NVCV_F16          = 4,  //!< 16-bit floating-point (half precision).
  NVCV_U32          = 5,  //!< Unsigned 32-bit integer.
  NVCV_S32          = 6,  //!< Signed 32-bit integer.
  NVCV_F32          = 7,  //!< 32-bit floating-point (float).
  NVCV_U64          = 8,  //!< Unsigned 64-bit integer.
  NVCV_S64          = 9,  //!< Signed 64-bit integer.
  NVCV_F64          = 10, //!< 64-bit floating-point (double).
} NvCVImage_ComponentType;
|
||||
|
||||
|
||||
//! Value for the planar field or layout argument. Two values are currently accommodated for RGB:
//! Interleaved or chunky storage locates all components of a pixel adjacent in memory,
//! e.g. RGBRGBRGB... (denoted [RGB]).
//! Planar storage locates the same component of all pixels adjacent in memory,
//! e.g. RRRRR...GGGGG...BBBBB... (denoted [R][G][B])
//! YUV has many more variants.
//! 4:2:2 can be chunky, planar or semi-planar, with different orderings.
//! 4:2:0 can be planar or semi-planar, with different orderings.
//! Aliases are provided for FOURCCs defined at fourcc.org.
//! Note: the LSB can be used to distinguish between chunky and planar formats.
#define NVCV_INTERLEAVED 0  //!< All components of pixel(x,y) are adjacent (same as chunky) (default for non-YUV).
#define NVCV_CHUNKY      0  //!< All components of pixel(x,y) are adjacent (same as interleaved).
#define NVCV_PLANAR      1  //!< The same component of all pixels are adjacent.
#define NVCV_UYVY        2  //!< [UYVY] Chunky 4:2:2 (default for 4:2:2)
#define NVCV_VYUY        4  //!< [VYUY] Chunky 4:2:2
#define NVCV_YUYV        6  //!< [YUYV] Chunky 4:2:2
#define NVCV_YVYU        8  //!< [YVYU] Chunky 4:2:2
#define NVCV_CYUV        10 //!< [YUV] Chunky 4:4:4
#define NVCV_CYVU        12 //!< [YVU] Chunky 4:4:4
#define NVCV_YUV         3  //!< [Y][U][V] Planar 4:2:2 or 4:2:0 or 4:4:4
#define NVCV_YVU         5  //!< [Y][V][U] Planar 4:2:2 or 4:2:0 or 4:4:4
#define NVCV_YCUV        7  //!< [Y][UV] Semi-planar 4:2:2 or 4:2:0 (default for 4:2:0)
#define NVCV_YCVU        9  //!< [Y][VU] Semi-planar 4:2:2 or 4:2:0

//! The following are FOURCC aliases for specific layouts. Note that it is still required to specify the format as well
//! as the layout, e.g. NVCV_YUV420 and NVCV_NV12, even though the NV12 layout is only associated with YUV420 sampling.
#define NVCV_I420 NVCV_YUV  //!< [Y][U][V] Planar 4:2:0
#define NVCV_IYUV NVCV_YUV  //!< [Y][U][V] Planar 4:2:0
#define NVCV_YV12 NVCV_YVU  //!< [Y][V][U] Planar 4:2:0
#define NVCV_NV12 NVCV_YCUV //!< [Y][UV] Semi-planar 4:2:0 (default for 4:2:0)
#define NVCV_NV21 NVCV_YCVU //!< [Y][VU] Semi-planar 4:2:0
#define NVCV_YUY2 NVCV_YUYV //!< [YUYV] Chunky 4:2:2
#define NVCV_I444 NVCV_YUV  //!< [Y][U][V] Planar 4:4:4
#define NVCV_YM24 NVCV_YUV  //!< [Y][U][V] Planar 4:4:4
#define NVCV_YM42 NVCV_YVU  //!< [Y][V][U] Planar 4:4:4
#define NVCV_NV24 NVCV_YCUV //!< [Y][UV] Semi-planar 4:4:4
#define NVCV_NV42 NVCV_YCVU //!< [Y][VU] Semi-planar 4:4:4
|
||||
|
||||
//! The following are ORed together for the colorspace field for YUV.
//! NVCV_601 and NVCV_709 describe the color axes of YUV.
//! NVCV_VIDEO_RANGE and NVCV_FULL_RANGE describe the range, [16, 235] or [0, 255], respectively.
//! NVCV_CHROMA_COSITED and NVCV_CHROMA_INTSTITIAL describe the location of the chroma samples.
#define NVCV_601                0x00 //!< The Rec.601 YUV colorspace, typically used for SD.
#define NVCV_709                0x01 //!< The Rec.709 YUV colorspace, typically used for HD.
#define NVCV_2020               0x02 //!< The Rec.2020 YUV colorspace.
#define NVCV_VIDEO_RANGE        0x00 //!< The video range is [16, 235].
#define NVCV_FULL_RANGE         0x04 //!< The video range is [ 0, 255].
#define NVCV_CHROMA_COSITED     0x00 //!< The chroma is sampled at the same location as the luma samples horizontally.
#define NVCV_CHROMA_INTSTITIAL  0x08 //!< The chroma is sampled between luma samples horizontally.
#define NVCV_CHROMA_TOPLEFT     0x10 //!< The chroma is sampled at the same location as the luma samples horizontally and vertically.
#define NVCV_CHROMA_MPEG2       NVCV_CHROMA_COSITED    //!< As is most video.
#define NVCV_CHROMA_MPEG1       NVCV_CHROMA_INTSTITIAL
#define NVCV_CHROMA_JPEG        NVCV_CHROMA_INTSTITIAL
#define NVCV_CHROMA_H261        NVCV_CHROMA_INTSTITIAL
#define NVCV_CHROMA_INTERSTITIAL NVCV_CHROMA_INTSTITIAL //!< Correct spelling

//! This is the value for the gpuMem field or the memSpace argument.
#define NVCV_CPU        0 //!< The buffer is stored in CPU memory.
#define NVCV_GPU        1 //!< The buffer is stored in CUDA memory.
#define NVCV_CUDA       1 //!< The buffer is stored in CUDA memory.
#define NVCV_CPU_PINNED 2 //!< The buffer is stored in pinned CPU memory.
#define NVCV_CUDA_ARRAY 3 //!< A CUDA array is used for storage.
|
||||
|
||||
//! Image descriptor.
//! Plain C struct describing an image buffer; the optional C++ members below
//! are thin conveniences over the C API and add no data members.
typedef struct
#ifdef _MSC_VER
  __declspec(dllexport)
#endif // _MSC_VER
NvCVImage {
  unsigned int            width;          //!< The number of pixels horizontally in the image.
  unsigned int            height;         //!< The number of pixels vertically in the image.
  signed int              pitch;          //!< The byte stride between pixels vertically.
  NvCVImage_PixelFormat   pixelFormat;    //!< The format of the pixels in the image.
  NvCVImage_ComponentType componentType;  //!< The data type used to represent each component of the image.
  unsigned char           pixelBytes;     //!< The number of bytes in a chunky pixel.
  unsigned char           componentBytes; //!< The number of bytes in each pixel component.
  unsigned char           numComponents;  //!< The number of components in each pixel.
  unsigned char           planar;         //!< NVCV_CHUNKY, NVCV_PLANAR, NVCV_UYVY, ....
  unsigned char           gpuMem;         //!< NVCV_CPU, NVCV_CPU_PINNED, NVCV_CUDA, NVCV_GPU
  unsigned char           colorspace;     //!< An OR of colorspace, range and chroma phase.
  unsigned char           reserved[2];    //!< For structure padding and future expansion. Set to 0.
  void                    *pixels;        //!< Pointer to pixel(0,0) in the image.
  void                    *deletePtr;     //!< Buffer memory to be deleted (can be NULL).
  void                    (*deleteProc)(void *p); //!< Delete procedure to call rather than free().
  unsigned long long      bufferBytes;    //!< The maximum amount of memory available through pixels.


#ifdef __cplusplus

  //! Default constructor: fill with 0.
  inline NvCVImage();

  //! Allocation constructor.
  //! \param[in]  width     the number of pixels horizontally.
  //! \param[in]  height    the number of pixels vertically.
  //! \param[in]  format    the format of the pixels.
  //! \param[in]  type      the type of each pixel component.
  //! \param[in]  layout    One of { NVCV_CHUNKY, NVCV_PLANAR } or one of the YUV layouts.
  //! \param[in]  memSpace  One of { NVCV_CPU, NVCV_CPU_PINNED, NVCV_GPU, NVCV_CUDA }
  //! \param[in]  alignment row byte alignment. Choose 0 or a power of 2.
  //!                       1: yields no gap whatsoever between scanlines;
  //!                       0: default alignment: 4 on CPU, and cudaMallocPitch's choice on GPU.
  //!                       Other common values are 16 or 32 for cache line size.
  inline NvCVImage(unsigned width, unsigned height, NvCVImage_PixelFormat format, NvCVImage_ComponentType type,
                   unsigned layout = NVCV_CHUNKY, unsigned memSpace = NVCV_CPU, unsigned alignment = 0);

  //! Subimage constructor.
  //! \param[in]  fullImg   the full image, from which this subImage view is to be created.
  //! \param[in]  x         the left edge of the subImage, in reference to the full image.
  //! \param[in]  y         the top edge of the subImage, in reference to the full image.
  //! \param[in]  width     the width of the subImage, in pixels.
  //! \param[in]  height    the height of the subImage, in pixels.
  //! \bug This does not work for planar or semi-planar formats, neither RGB nor YUV.
  //! \note This does work for all chunky formats, including UYVY, VYUY, YUYV, YVYU.
  inline NvCVImage(NvCVImage *fullImg, int x, int y, unsigned width, unsigned height);

  //! Destructor
  inline ~NvCVImage();

  //! Copy a rectangular subimage. This works for CPU->CPU, CPU->GPU, GPU->GPU, and GPU->CPU.
  //! \param[in]  src     The source image from which to copy.
  //! \param[in]  srcX    The left coordinate of the src rectangle.
  //! \param[in]  srcY    The top coordinate of the src rectangle.
  //! \param[in]  dstX    The left coordinate of the dst rectangle.
  //! \param[in]  dstY    The top coordinate of the dst rectangle.
  //! \param[in]  width   The width of the rectangle to be copied, in pixels.
  //! \param[in]  height  The height of the rectangle to be copied, in pixels.
  //! \param[in]  stream  the CUDA stream.
  //! \note NvCVImage_Transfer() can handle more cases.
  //! \return NVCV_SUCCESS         if successful
  //! \return NVCV_ERR_MISMATCH    if the formats are different
  //! \return NVCV_ERR_CUDA        if a CUDA error occurred
  //! \return NVCV_ERR_PIXELFORMAT if the pixel format is not yet accommodated.
  inline NvCV_Status copyFrom(const NvCVImage *src, int srcX, int srcY, int dstX, int dstY,
                              unsigned width, unsigned height, struct CUstream_st* stream = 0);

  //! Copy from one image to another. This works for CPU->CPU, CPU->GPU, GPU->GPU, and GPU->CPU.
  //! \param[in]  src     The source image from which to copy.
  //! \param[in]  stream  the CUDA stream.
  //! \note NvCVImage_Transfer() can handle more cases.
  //! \return NVCV_SUCCESS         if successful
  //! \return NVCV_ERR_MISMATCH    if the formats are different
  //! \return NVCV_ERR_CUDA        if a CUDA error occurred
  //! \return NVCV_ERR_PIXELFORMAT if the pixel format is not yet accommodated.
  inline NvCV_Status copyFrom(const NvCVImage *src, struct CUstream_st* stream = 0);

#endif // __cplusplus
} NvCVImage;
|
||||
|
||||
|
||||
//! Integer rectangle.
typedef struct NvCVRect2i {
  int x;      //!< The left edge of the rectangle.
  int y;      //!< The top edge of the rectangle.
  int width;  //!< The width of the rectangle.
  int height; //!< The height of the rectangle.
} NvCVRect2i;


//! Integer point.
typedef struct NvCVPoint2i {
  int x;  //!< The horizontal coordinate.
  int y;  //!< The vertical coordinate.
} NvCVPoint2i;
|
||||
|
||||
|
||||
//! Initialize an image. The C++ constructors can initialize this appropriately.
//! This is called by the C++ constructor, but C code should call this explicitly.
//! No pixel memory is allocated; the caller supplies the buffer.
//! \param[in,out] im       the image to initialize.
//! \param[in]     width    the desired width  of the image, in pixels.
//! \param[in]     height   the desired height of the image, in pixels.
//! \param[in]     pitch    the byte stride between pixels vertically.
//! \param[in]     pixels   a pointer to the pixel buffer.
//! \param[in]     format   the format of the pixels.
//! \param[in]     type     the type of the components of the pixels.
//! \param[in]     layout   One of { NVCV_CHUNKY, NVCV_PLANAR } or one of the YUV layouts.
//! \param[in]     memSpace Location of the buffer: one of { NVCV_CPU, NVCV_CPU_PINNED, NVCV_GPU, NVCV_CUDA }
//! \return NVCV_SUCCESS         if successful
//! \return NVCV_ERR_PIXELFORMAT if the pixel format is not yet accommodated.
NvCV_Status NvCV_API NvCVImage_Init(NvCVImage *im, unsigned width, unsigned height, int pitch, void *pixels,
  NvCVImage_PixelFormat format, NvCVImage_ComponentType type, unsigned layout, unsigned memSpace);
|
||||
|
||||
|
||||
//! Initialize a view into a subset of an existing image.
//! No memory is allocated -- the fullImg buffer is used.
//! \param[in]  subImg  the sub-image view into the existing full image.
//! \param[in]  fullImg the existing full image.
//! \param[in]  x       the left edge of the sub-image, as coordinate of the full image.
//! \param[in]  y       the top  edge of the sub-image, as coordinate of the full image.
//! \param[in]  width   the desired width  of the subImage, in pixels.
//! \param[in]  height  the desired height of the subImage, in pixels.
//! \bug This does not work in general for planar or semi-planar formats, neither RGB nor YUV.
//!      However, it does work for all formats with the full image, to make a shallow copy, e.g.
//!      NvCVImage_InitView(&subImg, &fullImg, 0, 0, fullImage.width, fullImage.height).
//!      Cropping a planar or semi-planar image can be accomplished with NvCVImage_TransferRect().
//! \note This does work for all chunky formats, including UYVY, VYUY, YUYV, YVYU.
//! \sa { NvCVImage_TransferRect }
void NvCV_API NvCVImage_InitView(NvCVImage *subImg, NvCVImage *fullImg, int x, int y, unsigned width, unsigned height);
|
||||
|
||||
|
||||
//! Allocate memory for, and initialize an image. This assumes that the image data structure has nothing meaningful in it.
//! This is called by the C++ constructor, but C code can call this to allocate an image.
//! \param[in,out] im        the image to initialize.
//! \param[in]     width     the desired width  of the image, in pixels.
//! \param[in]     height    the desired height of the image, in pixels.
//! \param[in]     format    the format of the pixels.
//! \param[in]     type      the type of the components of the pixels.
//! \param[in]     layout    One of { NVCV_CHUNKY, NVCV_PLANAR } or one of the YUV layouts.
//! \param[in]     memSpace  Location of the buffer: one of { NVCV_CPU, NVCV_CPU_PINNED, NVCV_GPU, NVCV_CUDA }
//! \param[in]     alignment row byte alignment. Choose 0 or a power of 2.
//!                          1: yields no gap whatsoever between scanlines;
//!                          0: default alignment: 4 on CPU, and cudaMallocPitch's choice on GPU.
//!                          Other common values are 16 or 32 for cache line size.
//! \return NVCV_SUCCESS         if the operation was successful.
//! \return NVCV_ERR_PIXELFORMAT if the pixel format is not accommodated.
//! \return NVCV_ERR_MEMORY      if there is not enough memory to allocate the buffer.
NvCV_Status NvCV_API NvCVImage_Alloc(NvCVImage *im, unsigned width, unsigned height, NvCVImage_PixelFormat format,
  NvCVImage_ComponentType type, unsigned layout, unsigned memSpace, unsigned alignment);
|
||||
|
||||
|
||||
//! Reallocate memory for, and initialize an image. This assumes that the image is valid.
//! It will check bufferBytes to see if enough memory is already available, and will reshape rather than realloc if true.
//! Otherwise, it will free the previous buffer and reallocate a new one.
//! \param[in,out] im        the image to initialize.
//! \param[in]     width     the desired width  of the image, in pixels.
//! \param[in]     height    the desired height of the image, in pixels.
//! \param[in]     format    the format of the pixels.
//! \param[in]     type      the type of the components of the pixels.
//! \param[in]     layout    One of { NVCV_CHUNKY, NVCV_PLANAR } or one of the YUV layouts.
//! \param[in]     memSpace  Location of the buffer: one of { NVCV_CPU, NVCV_CPU_PINNED, NVCV_GPU, NVCV_CUDA }
//! \param[in]     alignment row byte alignment. Choose 0 or a power of 2.
//!                          1: yields no gap whatsoever between scanlines;
//!                          0: default alignment: 4 on CPU, and cudaMallocPitch's choice on GPU.
//!                          Other common values are 16 or 32 for cache line size.
//! \return NVCV_SUCCESS         if the operation was successful.
//! \return NVCV_ERR_PIXELFORMAT if the pixel format is not accommodated.
//! \return NVCV_ERR_MEMORY      if there is not enough memory to allocate the buffer.
NvCV_Status NvCV_API NvCVImage_Realloc(NvCVImage *im, unsigned width, unsigned height, NvCVImage_PixelFormat format,
  NvCVImage_ComponentType type, unsigned layout, unsigned memSpace, unsigned alignment);
|
||||
|
||||
|
||||
//! Deallocate the image buffer from the image. The image is not deallocated.
|
||||
//! param[in,out] im the image whose buffer is to be deallocated.
|
||||
void NvCV_API NvCVImage_Dealloc(NvCVImage *im);
|
||||
|
||||
|
||||
//! Deallocate the image buffer from the image asynchronously on the specified stream. The image is not deallocated.
|
||||
//! param[in,out] im the image whose buffer is to be deallocated.
|
||||
//! param[int] stream the CUDA stream on which the image buffer is to be deallocated..
|
||||
void NvCV_API NvCVImage_DeallocAsync(NvCVImage *im, struct CUstream_st *stream);
|
||||
|
||||
|
||||
//! Allocate a new image, with storage (C-style constructor).
|
||||
//! \param[in] width the desired width of the image, in pixels.
|
||||
//! \param[in] height the desired height of the image, in pixels.
|
||||
//! \param[in] format the format of the pixels.
|
||||
//! \param[in] type the type of the components of the pixels.
|
||||
//! \param[in] layout One of { NVCV_CHUNKY, NVCV_PLANAR } or one of the YUV layouts.
|
||||
//! \param[in] memSpace Location of the buffer: one of { NVCV_CPU, NVCV_CPU_PINNED, NVCV_GPU, NVCV_CUDA }
|
||||
//! \param[in] alignment row byte alignment. Choose 0 or a power of 2.
|
||||
//! 1: yields no gap whatsoever between scanlines;
|
||||
//! 0: default alignment: 4 on CPU, and cudaMallocPitch's choice on GPU.
|
||||
//! Other common values are 16 or 32 for cache line size.
|
||||
//! \param[out] *out will be a pointer to the new image if successful; otherwise NULL.
|
||||
//! \return NVCV_SUCCESS if the operation was successful.
|
||||
//! \return NVCV_ERR_PIXELFORMAT if the pixel format is not accommodated.
|
||||
//! \return NVCV_ERR_MEMORY if there is not enough memory to allocate the buffer.
|
||||
NvCV_Status NvCV_API NvCVImage_Create(unsigned width, unsigned height, NvCVImage_PixelFormat format,
|
||||
NvCVImage_ComponentType type, unsigned layout, unsigned memSpace, unsigned alignment, NvCVImage **out);
|
||||
|
||||
|
||||
//! Deallocate the image allocated with NvCVImage_Create() (C-style destructor).
|
||||
void NvCV_API NvCVImage_Destroy(NvCVImage *im);
|
||||
|
||||
|
||||
//! Get offsets for the components of a pixel format.
|
||||
//! These are not byte offsets, but component offsets.
|
||||
//! \param[in] format the pixel format to be interrogated.
|
||||
//! \param[out] rOff a place to store the offset for the red channel (can be NULL).
|
||||
//! \param[out] gOff a place to store the offset for the green channel (can be NULL).
|
||||
//! \param[out] bOff a place to store the offset for the blue channel (can be NULL).
|
||||
//! \param[out] aOff a place to store the offset for the alpha channel (can be NULL).
|
||||
//! \param[out] yOff a place to store the offset for the luminance channel (can be NULL).
|
||||
void NvCV_API NvCVImage_ComponentOffsets(NvCVImage_PixelFormat format, int *rOff, int *gOff, int *bOff, int *aOff, int *yOff);
|
||||
|
||||
|
||||
//! Transfer one image to another, with a limited set of conversions.
//!
//! If any of the images resides on the GPU, it may run asynchronously,
//! so cudaStreamSynchronize() should be called if it is necessary to run synchronously.
//! The following table indicates (with X) the currently-implemented conversions:
//!    +-------------------+-------------+-------------+-------------+-------------+
//!    |                   |  u8 --> u8  |  u8 --> f32 | f32 --> u8  | f32 --> f32 |
//!    +-------------------+-------------+-------------+-------------+-------------+
//!    | Y      --> Y      |      X      |             |      X      |      X      |
//!    | Y      --> A      |      X      |             |      X      |      X      |
//!    | Y      --> RGB    |      X      |      X      |      X      |      X      |
//!    | Y      --> RGBA   |      X      |      X      |      X      |      X      |
//!    | A      --> Y      |      X      |             |      X      |      X      |
//!    | A      --> A      |      X      |             |      X      |      X      |
//!    | A      --> RGB    |      X      |      X      |      X      |      X      |
//!    | A      --> RGBA   |      X      |             |             |             |
//!    | RGB    --> Y      |      X      |      X      |             |             |
//!    | RGB    --> A      |      X      |      X      |             |             |
//!    | RGB    --> RGB    |      X      |      X      |      X      |      X      |
//!    | RGB    --> RGBA   |      X      |      X      |      X      |      X      |
//!    | RGBA   --> Y      |      X      |      X      |             |             |
//!    | RGBA   --> A      |      X      |             |             |             |
//!    | RGBA   --> RGB    |      X      |      X      |      X      |      X      |
//!    | RGBA   --> RGBA   |      X      |      X      |      X      |      X      |
//!    | RGB    --> YUV420 |      X      |             |      X      |             |
//!    | RGBA   --> YUV420 |      X      |             |      X      |             |
//!    | RGB    --> YUV422 |      X      |             |      X      |             |
//!    | RGBA   --> YUV422 |      X      |             |      X      |             |
//!    | RGB    --> YUV444 |      X      |             |      X      |             |
//!    | RGBA   --> YUV444 |      X      |             |      X      |             |
//!    | YUV420 --> RGB    |      X      |      X      |             |             |
//!    | YUV420 --> RGBA   |      X      |      X      |             |             |
//!    | YUV422 --> RGB    |      X      |      X      |             |             |
//!    | YUV422 --> RGBA   |      X      |      X      |             |             |
//!    | YUV444 --> RGB    |      X      |      X      |             |             |
//!    | YUV444 --> RGBA   |      X      |      X      |             |             |
//!    +-------------------+-------------+-------------+-------------+-------------+
//! where
//! * Either source or destination can be CHUNKY or PLANAR.
//! * Either source or destination can reside on the CPU or the GPU.
//! * The RGB components are in any order (i.e. RGB or BGR; RGBA or BGRA).
//! * For RGBA (or BGRA) destinations, most implementations do not change the alpha channel, so it is recommended to
//!   set it at initialization time with [cuda]memset(im.pixels, -1, im.pitch * im.height) or
//!   [cuda]memset(im.pixels, -1, im.pitch * im.height * im.numComponents) for chunky and planar images respectively.
//! * YUV requires that the colorspace field be set manually prior to Transfer, e.g. typical for layout=NVCV_NV12:
//!   image.colorspace = NVCV_709 | NVCV_VIDEO_RANGE | NVCV_CHROMA_INTSTITIAL;
//! * There are also RGBf16-->RGBf32 and RGBf32-->RGBf16 transfers.
//! * Additionally, when the src and dst formats are the same, all formats are accommodated on CPU and GPU,
//!   and this can be used as a replacement for cudaMemcpy2DAsync() (which it utilizes). This is also true for YUV,
//!   whose src and dst must share the same format, layout and colorspace.
//!
//! When there is some kind of conversion AND the src and dst reside on different processors (CPU, GPU),
//! it is necessary to have a temporary GPU buffer, which is reshaped as needed to match the characteristics
//! of the CPU image. The same temporary image can be used in subsequent calls to NvCVImage_Transfer(),
//! regardless of the shape, format or component type, as it will grow as needed to accommodate
//! the largest memory requirement. The recommended usage for most cases is to supply an empty image
//! as the temporary; if it is not needed, no buffer is allocated. NULL can be supplied as the tmp
//! image, in which case an ephemeral buffer is allocated if needed, with resultant
//! performance degradation for image sequences.
//!
//! \param[in]     src    the source image.
//! \param[out]    dst    the destination image.
//! \param[in]     scale  a scale factor that can be applied when one (but not both) of the images
//!                       is based on floating-point components; this parameter is ignored when all image components
//!                       are represented with integer data types, or all image components are represented with
//!                       floating-point data types.
//! \param[in]     stream the stream on which to perform the copy. This is ignored if both images reside on the CPU.
//! \param[in,out] tmp    a temporary buffer that is sometimes needed when transferring images
//!                       between the CPU and GPU in either direction (can be empty or NULL).
//!                       It has the same characteristics as the CPU image, but it resides on the GPU.
//! \return NVCV_SUCCESS         if successful,
//!         NVCV_ERR_PIXELFORMAT if one of the pixel formats is not accommodated.
//!         NVCV_ERR_CUDA        if a CUDA error has occurred.
//!         NVCV_ERR_GENERAL     if an otherwise unspecified error has occurred.
NvCV_Status NvCV_API NvCVImage_Transfer(
    const NvCVImage *src, NvCVImage *dst, float scale, struct CUstream_st *stream, NvCVImage *tmp);
|
||||
|
||||
|
||||
//! Transfer a rectangular portion of an image.
//! See NvCVImage_Transfer() for the pixel format combinations that are implemented.
//! \param[in]  src     the source image.
//! \param[in]  srcRect the subRect of the src to be transferred (NULL implies the whole image).
//! \param[out] dst     the destination image.
//! \param[in]  dstPt   location to which the srcRect is to be copied (NULL implies (0,0)).
//! \param[in]  scale   scale factor applied to the magnitude during transfer, typically 1, 255 or 1/255.
//! \param[in]  stream  the CUDA stream.
//! \param[in]  tmp     a staging image.
//! \return NVCV_SUCCESS if the operation was completed successfully.
//! \note The actual transfer region may be smaller, because the rects are clipped against the images.
NvCV_Status NvCV_API NvCVImage_TransferRect(
    const NvCVImage *src, const NvCVRect2i *srcRect, NvCVImage *dst, const NvCVPoint2i *dstPt,
    float scale, struct CUstream_st *stream, NvCVImage *tmp);
|
||||
|
||||
|
||||
//! Transfer from a YUV image.
//! YUVu8 --> RGBu8 and YUVu8 --> RGBf32 are currently available.
//! \param[in]  y             pointer to pixel(0,0) of the luminance channel.
//! \param[in]  yPixBytes     the byte stride between y pixels horizontally.
//! \param[in]  yPitch        the byte stride between y pixels vertically.
//! \param[in]  u             pointer to pixel(0,0) of the u (Cb) chrominance channel.
//! \param[in]  v             pointer to pixel(0,0) of the v (Cr) chrominance channel.
//! \param[in]  uvPixBytes    the byte stride between u or v pixels horizontally.
//! \param[in]  uvPitch       the byte stride between u or v pixels vertically.
//! \param[in]  yuvFormat     the pixel format of the source YUV buffers.
//! \param[in]  yuvType       the component type of the source YUV buffers.
//! \param[in]  yuvColorSpace the yuv colorspace, specifying range, chromaticities, and chrominance phase.
//! \param[in]  yuvMemSpace   the memory space where the pixel buffers reside.
//! \param[out] dst           the destination image.
//! \param[in]  dstRect       the destination rectangle (NULL implies the whole image).
//! \param[in]  scale         scale factor applied to the magnitude during transfer, typically 1, 255 or 1/255.
//! \param[in]  stream        the CUDA stream.
//! \param[in]  tmp           a staging image.
//! \return NVCV_SUCCESS if the operation was completed successfully.
//! \note The actual transfer region may be smaller, because the rects are clipped against the images.
//! \note This is supplied for use with YUV buffers that do not have the standard structure
//!       that are expected for NvCVImage_Transfer() and NvCVImage_TransferRect.
NvCV_Status NvCV_API NvCVImage_TransferFromYUV(
    const void *y, int yPixBytes, int yPitch,
    const void *u, const void *v, int uvPixBytes, int uvPitch,
    NvCVImage_PixelFormat yuvFormat, NvCVImage_ComponentType yuvType,
    unsigned yuvColorSpace, unsigned yuvMemSpace,
    NvCVImage *dst, const NvCVRect2i *dstRect, float scale, struct CUstream_st *stream, NvCVImage *tmp);


//! Transfer to a YUV image.
//! RGBu8 --> YUVu8 and RGBf32 --> YUVu8 are currently available.
//! \param[in]  src           the source image.
//! \param[in]  srcRect       the rectangle of the src to be transferred (NULL implies the whole image).
//! \param[out] y             pointer to pixel(0,0) of the luminance channel.
//! \param[in]  yPixBytes     the byte stride between y pixels horizontally.
//! \param[in]  yPitch        the byte stride between y pixels vertically.
//! \param[out] u             pointer to pixel(0,0) of the u (Cb) chrominance channel.
//! \param[out] v             pointer to pixel(0,0) of the v (Cr) chrominance channel.
//! \param[in]  uvPixBytes    the byte stride between u or v pixels horizontally.
//! \param[in]  uvPitch       the byte stride between u or v pixels vertically.
//! \param[in]  yuvFormat     the pixel format of the destination YUV buffers.
//! \param[in]  yuvType       the component type of the destination YUV buffers.
//! \param[in]  yuvColorSpace the yuv colorspace, specifying range, chromaticities, and chrominance phase.
//! \param[in]  yuvMemSpace   the memory space where the pixel buffers reside.
//! \param[in]  scale         scale factor applied to the magnitude during transfer, typically 1, 255 or 1/255.
//! \param[in]  stream        the CUDA stream.
//! \param[in]  tmp           a staging image.
//! \return NVCV_SUCCESS if the operation was completed successfully.
//! \note The actual transfer region may be smaller, because the rects are clipped against the images.
//! \note This is supplied for use with YUV buffers that do not have the standard structure
//!       that are expected for NvCVImage_Transfer() and NvCVImage_TransferRect.
//! NOTE(review): the y/u/v destination pointers are declared const void* although documented as [out] —
//! confirm against the implementation whether the const qualification is intentional.
NvCV_Status NvCV_API NvCVImage_TransferToYUV(
    const NvCVImage *src, const NvCVRect2i *srcRect,
    const void *y, int yPixBytes, int yPitch,
    const void *u, const void *v, int uvPixBytes, int uvPitch,
    NvCVImage_PixelFormat yuvFormat, NvCVImage_ComponentType yuvType,
    unsigned yuvColorSpace, unsigned yuvMemSpace,
    float scale, struct CUstream_st *stream, NvCVImage *tmp);
|
||||
|
||||
|
||||
//! Between rendering by a graphics system and Transfer by CUDA, it is necessary to map the texture resource.
//! There is a fair amount of overhead, so its use should be minimized.
//! Every call to NvCVImage_MapResource() should be matched by a subsequent call to NvCVImage_UnmapResource().
//! \param[in,out] im     the image to be mapped.
//! \param[in]     stream the stream on which the mapping is to be performed.
//! \return NVCV_SUCCESS if the operation was completed successfully.
//! \note This is an experimental API. If you find it useful, please respond to XXX@YYY.com,
//!       otherwise we may drop support.
/* EXPERIMENTAL */ NvCV_Status NvCV_API NvCVImage_MapResource(NvCVImage *im, struct CUstream_st *stream);


//! After transfer by CUDA, the texture resource must be unmapped in order to be used by the graphics system again.
//! There is a fair amount of overhead, so its use should be minimized.
//! Every call to NvCVImage_UnmapResource() should correspond to a preceding call to NvCVImage_MapResource().
//! \param[in,out] im     the image to be unmapped.
//! \param[in]     stream the CUDA stream on which the unmapping is to be performed.
//! \return NVCV_SUCCESS if the operation was completed successfully.
//! \note This is an experimental API. If you find it useful, please respond to XXX@YYY.com,
//!       otherwise we may drop support.
/* EXPERIMENTAL */ NvCV_Status NvCV_API NvCVImage_UnmapResource(NvCVImage *im, struct CUstream_st *stream);
|
||||
|
||||
|
||||
//! Composite one source image over another using the given matte.
//! This accommodates all RGB and RGBA formats, with u8 and f32 components.
//! If the bg has alpha, then the dst alpha is updated for use in subsequent composition.
//! \param[in]  fg     the foreground source image.
//! \param[in]  bg     the background source image.
//! \param[in]  mat    the matte Yu8 (or Au8) image, indicating where the src should come through.
//! \param[out] dst    the destination image. This can be the same as fg or bg.
//! \param[in]  stream the CUDA stream on which the composition is to be performed.
//! \return NVCV_SUCCESS         if the operation was successful.
//! \return NVCV_ERR_PIXELFORMAT if the pixel format is not accommodated.
//! \return NVCV_ERR_MISMATCH    if either the fg & bg & dst formats do not match, or if fg & bg & dst & mat are not
//!                              in the same address space (CPU or GPU).
#if RTX_CAMERA_IMAGE == 0
NvCV_Status NvCV_API NvCVImage_Composite(const NvCVImage *fg, const NvCVImage *bg, const NvCVImage *mat, NvCVImage *dst,
    struct CUstream_st *stream);
#else // RTX_CAMERA_IMAGE == 1   // No GPU acceleration; note: no stream parameter in this variant
NvCV_Status NvCV_API NvCVImage_Composite(const NvCVImage *fg, const NvCVImage *bg, const NvCVImage *mat, NvCVImage *dst);
#endif // RTX_CAMERA_IMAGE == 1


//! Composite one source image rectangular region over another using the given matte.
//! This accommodates all RGB and RGBA formats, with u8 and f32 components.
//! If the bg has alpha, then the dst alpha is updated for use in subsequent composition.
//! If the background is not opaque, it is recommended that all images be premultiplied by alpha,
//! and mode 1 composition be used, to yield the most meaningful composite matte.
//! \param[in]  fg     the foreground source image.
//! \param[in]  fgOrg  the upper-left corner of the fg image to be composited (NULL implies (0,0)).
//! \param[in]  bg     the background source image.
//! \param[in]  bgOrg  the upper-left corner of the bg image to be composited (NULL implies (0,0)).
//! \param[in]  mat    the matte image, indicating where the src should come through.
//!                    This determines the size of the rectangle to be composited.
//!                    If this is multi-channel, the alpha channel is used as the matte.
//! \param[in]  mode   the composition mode: 0 (straight alpha over) or 1 (premultiplied alpha over).
//! \param[out] dst    the destination image. This can be the same as fg or bg.
//! \param[in]  dstOrg the upper-left corner of the dst image to be updated (NULL implies (0,0)).
//! \param[in]  stream the CUDA stream on which the composition is to be performed.
//! \note If a smaller region of a matte is desired, a window can be created using
//!       NvCVImage_InitView() for chunky pixels, as illustrated below in NvCVImage_CompositeRectA().
//! \return NVCV_SUCCESS         if the operation was successful.
//! \return NVCV_ERR_PIXELFORMAT if the pixel format is not accommodated.
//! \return NVCV_ERR_MISMATCH    if either the fg & bg & dst formats do not match, or if fg & bg & dst & mat are not
//!                              in the same address space (CPU or GPU).
NvCV_Status NvCV_API NvCVImage_CompositeRect(
    const NvCVImage *fg,  const NvCVPoint2i *fgOrg,
    const NvCVImage *bg,  const NvCVPoint2i *bgOrg,
    const NvCVImage *mat, unsigned mode,
    NvCVImage       *dst, const NvCVPoint2i *dstOrg,
    struct CUstream_st *stream);
|
||||
|
||||
|
||||
//! Composite one RGBA or BGRA source image rectangular region over another RGB, BGR, RGBA or BGRA region.
|
||||
//! This accommodates all RGB and RGBA formats, with u8 and f32 components.
|
||||
//! If the bg has alpha, then the dst alpha is updated for use in subsequent composition.
|
||||
//! If the background is not opaque, it is recommended that all images be premultiplied by alpha,
|
||||
//! and mode 1 composition be used, to yield the most meaningful composite matte.
|
||||
//! \param[in] fg the foreground RGBA or BGRA source image.
|
||||
//! \param[in] fgRect a sub-rectangle of the fg image (NULL implies the whole image).
|
||||
//! \param[in] bg the background source image.
|
||||
//! \param[in] bgOrg the upper-left corner of the bg image to be composited (NULL implies (0,0)).
|
||||
//! \param[in] mode the composition mode: 0 (straight alpha over) or 1 (premultiplied alpha over).
|
||||
//! \param[out] dst the destination image. This can be the same as fg or bg.
|
||||
//! \param[in] dstOrg the upper-left corner of the dst image to be updated (NULL implies (0,0)).
|
||||
//! \param[in] stream the CUDA stream on which the composition is to be performed.
|
||||
//! \return NVCV_SUCCESS if the operation was successful.
|
||||
//! \return NVCV_ERR_PIXELFORMAT if the pixel format is not accommodated.
|
||||
//! \return NVCV_ERR_MISMATCH if either the fg & bg & dst formats do not match, or if fg & bg & dst & mat are not
|
||||
//! in the same address space (CPU or GPU).
|
||||
//! \bug fgRect will only work for chunky images, not planar.
|
||||
#ifdef __cplusplus
|
||||
inline NvCV_Status NvCVImage_CompositeRectA(
|
||||
const NvCVImage *fg, const NvCVRect2i *fgRect,
|
||||
const NvCVImage *bg, const NvCVPoint2i *bgOrg,
|
||||
unsigned mode,
|
||||
NvCVImage *dst, const NvCVPoint2i *dstOrg,
|
||||
struct CUstream_st *stream
|
||||
) {
|
||||
if (fgRect) {
|
||||
NvCVImage fgView(const_cast<NvCVImage*>(fg), fgRect->x, fgRect->y, fgRect->width, fgRect->height);
|
||||
return NvCVImage_CompositeRect(&fgView, nullptr, bg, bgOrg, &fgView, mode, dst, dstOrg, stream);
|
||||
}
|
||||
return NvCVImage_CompositeRect(fg, nullptr, bg, bgOrg, fg, mode, dst, dstOrg, stream);
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
|
||||
//! Composite a source image over a constant color field using the given matte.
//! \param[in]     src     the source image.
//! \param[in]     mat     the matte image, indicating where the src should come through.
//! \param[in]     bgColor pointer to a location holding the desired flat background color, with the same format
//!                        and component ordering as the dst. This acts as a 1x1 background pixel buffer,
//!                        so should reside in the same memory space (CUDA or CPU) as the other buffers.
//! \param[in,out] dst     the destination image. May be the same as src.
//! \return NVCV_SUCCESS         if the operation was successful.
//! \return NVCV_ERR_PIXELFORMAT if the pixel format is not accommodated.
//! \return NVCV_ERR_MISMATCH    if fg & mat & dst & bgColor are not in the same address space (CPU or GPU).
//! \note The bgColor must remain valid until complete; this is an important consideration especially if
//!       the buffers are on the GPU and NvCVImage_CompositeOverConstant() runs asynchronously.
NvCV_Status NvCV_API NvCVImage_CompositeOverConstant(
#if RTX_CAMERA_IMAGE == 0
    const NvCVImage *src, const NvCVImage *mat, const void *bgColor, NvCVImage *dst, struct CUstream_st *stream
#else // RTX_CAMERA_IMAGE == 1
    const NvCVImage *src, const NvCVImage *mat, const unsigned char bgColor[3], NvCVImage *dst
#endif // RTX_CAMERA_IMAGE
);


//! Flip the image vertically.
//! No actual pixels are moved: it is just an accounting procedure.
//! This is extremely low overhead, but requires appropriate interpretation of the pitch.
//! Flipping twice yields the original orientation.
//! \param[in]  src the source image (NULL implies src == dst).
//! \param[out] dst the flipped image (can be the same as the src).
//! \return NVCV_SUCCESS         if successful.
//! \return NVCV_ERR_PIXELFORMAT for most planar formats.
//! \bug  This does not work for planar or semi-planar formats, neither RGB nor YUV.
//! \note This does work for all chunky formats, including UYVY, VYUY, YUYV, YVYU.
NvCV_Status NvCV_API NvCVImage_FlipY(const NvCVImage *src, NvCVImage *dst);


//! Get the pointers for YUV, based on the format.
//! \param[in]  im        The image to be deconstructed.
//! \param[out] y         A place to store the pointer to y(0,0).
//! \param[out] u         A place to store the pointer to u(0,0).
//! \param[out] v         A place to store the pointer to v(0,0).
//! \param[out] yPixBytes A place to store the byte stride between luma samples horizontally.
//! \param[out] cPixBytes A place to store the byte stride between chroma samples horizontally.
//! \param[out] yRowBytes A place to store the byte stride between luma samples vertically.
//! \param[out] cRowBytes A place to store the byte stride between chroma samples vertically.
//! \return NVCV_SUCCESS         If the information was gathered successfully.
//!         NVCV_ERR_PIXELFORMAT Otherwise.
NvCV_Status NvCV_API NvCVImage_GetYUVPointers(NvCVImage *im,
    unsigned char **y, unsigned char **u, unsigned char **v,
    int *yPixBytes, int *cPixBytes, int *yRowBytes, int *cRowBytes);


//! Sharpen an image.
//! The src and dst should be the same type - conversions are not performed.
//! This function is only implemented for NVCV_CHUNKY NVCV_U8 pixels, of format NVCV_RGB or NVCV_BGR.
//! \param[in]  sharpness the sharpness strength, calibrated so that 1 and 2 yields Adobe's Sharpen and Sharpen More.
//! \param[in]  src       the source image to be sharpened.
//! \param[out] dst       the resultant image (may be the same as the src).
//! \param[in]  stream    the CUDA stream on which to perform the computations.
//! \param[in]  tmp       a temporary working image. This can be NULL, but may result in lower performance.
//!                       It is best if it resides on the same processor (CPU or GPU) as the destination.
//! \return NVCV_SUCCESS         if the operation completed successfully.
//!         NVCV_ERR_MISMATCH    if the source and destination formats are different.
//!         NVCV_ERR_PIXELFORMAT if the function has not been implemented for the chosen pixel type.
NvCV_Status NvCV_API NvCVImage_Sharpen(float sharpness, const NvCVImage *src, NvCVImage *dst,
    struct CUstream_st *stream, NvCVImage *tmp);
|
||||
|
||||
|
||||
#ifdef __cplusplus
} // extern "C"

/********************************************************************************
 * NvCVImage default constructor
 ********************************************************************************/

//! Default constructor. Clears the pixel pointer, then calls NvCVImage_Alloc()
//! with zero dimensions and unknown format/type so that every field of the
//! struct is set to its initial (empty) state without allocating a buffer.
NvCVImage::NvCVImage() {
  pixels = nullptr;
  (void)NvCVImage_Alloc(this, 0, 0, NVCV_FORMAT_UNKNOWN, NVCV_TYPE_UNKNOWN, 0, 0, 0);
}
|
||||
|
||||
/********************************************************************************
 * NvCVImage allocation constructor
 ********************************************************************************/

//! Allocation constructor: allocate pixel storage with the given shape and location.
//! See NvCVImage_Alloc() for the parameter semantics. The allocation status is
//! discarded here; on failure the image is left as NvCVImage_Alloc() leaves it.
NvCVImage::NvCVImage(unsigned width, unsigned height, NvCVImage_PixelFormat format, NvCVImage_ComponentType type,
    unsigned layout, unsigned memSpace, unsigned alignment) {
  pixels = nullptr;
  (void)NvCVImage_Alloc(this, width, height, format, type, layout, memSpace, alignment);
}
|
||||
|
||||
/********************************************************************************
 * Subimage constructor
 ********************************************************************************/

//! Subimage (view) constructor: initialize this image as a window into fullImg
//! via NvCVImage_InitView(). No pixel storage is allocated or copied.
NvCVImage::NvCVImage(NvCVImage *fullImg, int x, int y, unsigned width, unsigned height) {
  NvCVImage_InitView(this, fullImg, x, y, width, height);
}
|
||||
|
||||
/********************************************************************************
 * NvCVImage destructor
 ********************************************************************************/

//! Destructor: release the pixel buffer (if owned) via NvCVImage_Dealloc().
NvCVImage::~NvCVImage() { NvCVImage_Dealloc(this); }
|
||||
|
||||
/********************************************************************************
 * copy subimage
 ********************************************************************************/

//! Copy a rectangular region from src into this image via NvCVImage_Transfer()/
//! NvCVImage_TransferRect().
//! \param[in] src    the source image.
//! \param[in] srcX   the x coordinate of the region's origin in src.
//! \param[in] srcY   the y coordinate of the region's origin in src.
//! \param[in] dstX   the x coordinate of the region's origin in this image.
//! \param[in] dstY   the y coordinate of the region's origin in this image.
//! \param[in] wd     the width  of the region, in pixels.
//! \param[in] ht     the height of the region, in pixels.
//! \param[in] stream the CUDA stream on which to perform the transfer.
//! \return the status of the underlying transfer.
NvCV_Status NvCVImage::copyFrom(const NvCVImage *src, int srcX, int srcY, int dstX, int dstY, unsigned wd,
                                unsigned ht, struct CUstream_st* stream) {
#if RTX_CAMERA_IMAGE // This only works for chunky images
  NvCVImage srcView, dstView;
  NvCVImage_InitView(&srcView, const_cast<NvCVImage *>(src), srcX, srcY, wd, ht);
  NvCVImage_InitView(&dstView, this, dstX, dstY, wd, ht);
  return NvCVImage_Transfer(&srcView, &dstView, 1.f, stream, nullptr);
#else // !RTX_CAMERA_IMAGE bug fix for non-chunky images
  NvCVRect2i srcRect = { (int)srcX, (int)srcY, (int)wd, (int)ht };
  NvCVPoint2i dstPt = { (int)dstX, (int)dstY };
  return NvCVImage_TransferRect(src, &srcRect, this, &dstPt, 1.f, stream, nullptr);
#endif // RTX_CAMERA_IMAGE
}
|
||||
|
||||
/********************************************************************************
 * copy image
 ********************************************************************************/

//! Copy the entire src image to this image, with the conversions supported by
//! NvCVImage_Transfer() and a scale factor of 1.
//! \param[in] src    the source image.
//! \param[in] stream the CUDA stream on which to perform the transfer.
//! \return the status of the underlying transfer.
NvCV_Status NvCVImage::copyFrom(const NvCVImage *src, struct CUstream_st* stream) {
  return NvCVImage_Transfer(src, this, 1.f, stream, nullptr);
}
|
||||
|
||||
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif // __NVCVIMAGE_H__
|
@ -0,0 +1,121 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
|
||||
#ifndef __NVCVSTATUS_H__
#define __NVCVSTATUS_H__

// NvCV_API decorates every public API function: DLL export + calling convention
// when building the library on Windows, empty otherwise (import side and Linux).
#ifndef NvCV_API
#ifdef _WIN32
#ifdef NVCV_API_EXPORT
#define NvCV_API __declspec(dllexport) __cdecl
#else
#define NvCV_API
#endif
#else //if linux
#define NvCV_API // TODO: Linux code goes here
#endif // _WIN32 or linux
#endif //NvCV_API
|
||||
|
||||
|
||||
// Give the API C linkage so it is callable from both C and C++.
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
|
||||
|
||||
|
||||
//! Status codes returned from APIs.
|
||||
typedef enum NvCV_Status {
|
||||
NVCV_SUCCESS = 0, //!< The procedure returned successfully.
|
||||
NVCV_ERR_GENERAL = -1, //!< An otherwise unspecified error has occurred.
|
||||
NVCV_ERR_UNIMPLEMENTED = -2, //!< The requested feature is not yet implemented.
|
||||
NVCV_ERR_MEMORY = -3, //!< There is not enough memory for the requested operation.
|
||||
NVCV_ERR_EFFECT = -4, //!< An invalid effect handle has been supplied.
|
||||
NVCV_ERR_SELECTOR = -5, //!< The given parameter selector is not valid in this effect filter.
|
||||
NVCV_ERR_BUFFER = -6, //!< An image buffer has not been specified.
|
||||
NVCV_ERR_PARAMETER = -7, //!< An invalid parameter value has been supplied for this effect+selector.
|
||||
NVCV_ERR_MISMATCH = -8, //!< Some parameters are not appropriately matched.
|
||||
NVCV_ERR_PIXELFORMAT = -9, //!< The specified pixel format is not accommodated.
|
||||
NVCV_ERR_MODEL = -10, //!< Error while loading the TRT model.
|
||||
NVCV_ERR_LIBRARY = -11, //!< Error loading the dynamic library.
|
||||
NVCV_ERR_INITIALIZATION = -12, //!< The effect has not been properly initialized.
|
||||
NVCV_ERR_FILE = -13, //!< The file could not be found.
|
||||
NVCV_ERR_FEATURENOTFOUND = -14, //!< The requested feature was not found
|
||||
NVCV_ERR_MISSINGINPUT = -15, //!< A required parameter was not set
|
||||
NVCV_ERR_RESOLUTION = -16, //!< The specified image resolution is not supported.
|
||||
NVCV_ERR_UNSUPPORTEDGPU = -17, //!< The GPU is not supported
|
||||
NVCV_ERR_WRONGGPU = -18, //!< The current GPU is not the one selected.
|
||||
NVCV_ERR_UNSUPPORTEDDRIVER = -19, //!< The currently installed graphics driver is not supported
|
||||
NVCV_ERR_MODELDEPENDENCIES = -20, //!< There is no model with dependencies that match this system
|
||||
NVCV_ERR_PARSE = -21, //!< There has been a parsing or syntax error while reading a file
|
||||
NVCV_ERR_MODELSUBSTITUTION = -22, //!< The specified model does not exist and has been substituted.
|
||||
NVCV_ERR_READ = -23, //!< An error occurred while reading a file.
|
||||
NVCV_ERR_WRITE = -24, //!< An error occurred while writing a file.
|
||||
NVCV_ERR_PARAMREADONLY = -25, //!< The selected parameter is read-only.
|
||||
NVCV_ERR_TRT_ENQUEUE = -26, //!< TensorRT enqueue failed.
|
||||
NVCV_ERR_TRT_BINDINGS = -27, //!< Unexpected TensorRT bindings.
|
||||
NVCV_ERR_TRT_CONTEXT = -28, //!< An error occurred while creating a TensorRT context.
|
||||
NVCV_ERR_TRT_INFER = -29, ///< The was a problem creating the inference engine.
|
||||
NVCV_ERR_TRT_ENGINE = -30, ///< There was a problem deserializing the inference runtime engine.
|
||||
NVCV_ERR_NPP = -31, //!< An error has occurred in the NPP library.
|
||||
NVCV_ERR_CONFIG = -32, //!< No suitable model exists for the specified parameter configuration.
|
||||
NVCV_ERR_TOOSMALL = -33, //!< A supplied parameter or buffer is not large enough.
|
||||
NVCV_ERR_TOOBIG = -34, //!< A supplied parameter is too big.
|
||||
NVCV_ERR_WRONGSIZE = -35, //!< A supplied parameter is not the expected size.
|
||||
NVCV_ERR_OBJECTNOTFOUND = -36, //!< The specified object was not found.
|
||||
NVCV_ERR_SINGULAR = -37, //!< A mathematical singularity has been encountered.
|
||||
NVCV_ERR_NOTHINGRENDERED = -38, //!< Nothing was rendered in the specified region.
|
||||
NVCV_ERR_CONVERGENCE = -39, //!< An iteration did not converge satisfactorily.
|
||||
|
||||
NVCV_ERR_OPENGL = -98, //!< An OpenGL error has occurred.
|
||||
NVCV_ERR_DIRECT3D = -99, //!< A Direct3D error has occurred.
|
||||
|
||||
NVCV_ERR_CUDA_BASE = -100, //!< CUDA errors are offset from this value.
|
||||
NVCV_ERR_CUDA_VALUE = -101, //!< A CUDA parameter is not within the acceptable range.
|
||||
NVCV_ERR_CUDA_MEMORY = -102, //!< There is not enough CUDA memory for the requested operation.
|
||||
NVCV_ERR_CUDA_PITCH = -112, //!< A CUDA pitch is not within the acceptable range.
|
||||
NVCV_ERR_CUDA_INIT = -127, //!< The CUDA driver and runtime could not be initialized.
|
||||
NVCV_ERR_CUDA_LAUNCH = -819, //!< The CUDA kernel launch has failed.
|
||||
NVCV_ERR_CUDA_KERNEL = -309, //!< No suitable kernel image is available for the device.
|
||||
NVCV_ERR_CUDA_DRIVER = -135, //!< The installed NVIDIA CUDA driver is older than the CUDA runtime library.
|
||||
NVCV_ERR_CUDA_UNSUPPORTED = -901, //!< The CUDA operation is not supported on the current system or device.
|
||||
NVCV_ERR_CUDA_ILLEGAL_ADDRESS = -800, //!< CUDA tried to load or store on an invalid memory address.
|
||||
NVCV_ERR_CUDA = -1099, //!< An otherwise unspecified CUDA error has been reported.
|
||||
} NvCV_Status;
|
||||
|
||||
|
||||
//! Get an error string corresponding to the given status code.
|
||||
//! \param[in] code the NvCV status code.
|
||||
//! \return the corresponding string.
|
||||
//! \todo Find a cleaner way to do this, because NvCV_API doesn't work.
|
||||
#ifdef _WIN32
|
||||
__declspec(dllexport) const char* __cdecl
|
||||
#else
|
||||
const char*
|
||||
#endif // _WIN32 or linux
|
||||
NvCV_GetErrorStringFromCode(NvCV_Status code);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif // __NVCVSTATUS_H__
|
@ -0,0 +1,76 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright(c) 2021 NVIDIA CORPORATION.All Rights Reserved.
|
||||
#
|
||||
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
||||
# and proprietary rights in and to this software, related documentation
|
||||
# and any modifications thereto.Any use, reproduction, disclosure or
|
||||
# distribution of this software and related documentation without an express
|
||||
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
||||
#
|
||||
###############################################################################*/
|
||||
|
||||
#ifndef __NVTRANSFER_D3D_H__
#define __NVTRANSFER_D3D_H__

#ifndef _WINDOWS_
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#endif // _WINDOWS_
#include <dxgitype.h>
#include "nvCVImage.h"

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus


//! Utility to determine the D3D format from the NvCVImage format, type and layout.
//! \param[in]  format    the pixel format.
//! \param[in]  type      the component type.
//! \param[in]  layout    the layout.
//! \param[out] d3dFormat a place to store the corresponding D3D format.
//! \return NVCV_SUCCESS if successful.
//! \note This is an experimental API. If you find it useful, please respond to XXX@YYY.com, otherwise we may drop support.
/* EXPERIMENTAL */ NvCV_Status NvCV_API NvCVImage_ToD3DFormat(NvCVImage_PixelFormat format, NvCVImage_ComponentType type, unsigned layout, DXGI_FORMAT *d3dFormat);


//! Utility to determine the NvCVImage format, component type and layout from a D3D format.
//! \param[in]  d3dFormat the D3D format to translate.
//! \param[out] format    a place to store the NvCVImage pixel format.
//! \param[out] type      a place to store the NvCVImage component type.
//! \param[out] layout    a place to store the NvCVImage layout.
//! \return NVCV_SUCCESS if successful.
//! \note This is an experimental API. If you find it useful, please respond to XXX@YYY.com, otherwise we may drop support.
/* EXPERIMENTAL */ NvCV_Status NvCV_API NvCVImage_FromD3DFormat(DXGI_FORMAT d3dFormat, NvCVImage_PixelFormat *format, NvCVImage_ComponentType *type, unsigned char *layout);


// The color-space conversions below are only available when dxgicommon.h
// (which defines DXGI_COLOR_SPACE_TYPE) has been included before this header.
#ifdef __dxgicommon_h__

//! Utility to determine the D3D color space from the NvCVImage color space.
//! \param[in]  nvcvColorSpace the NvCVImage color space.
//! \param[out] pD3dColorSpace a place to store the resultant D3D color space.
//! \return NVCV_SUCCESS if successful.
//! \return NVCV_ERR_PIXELFORMAT if there is no equivalent color space.
//! \note This is an experimental API. If you find it useful, please respond to XXX@YYY.com, otherwise we may drop support.
/* EXPERIMENTAL */ NvCV_Status NvCV_API NvCVImage_ToD3DColorSpace(unsigned char nvcvColorSpace, DXGI_COLOR_SPACE_TYPE *pD3dColorSpace);


//! Utility to determine the NvCVImage color space from the D3D color space.
//! \param[in]  d3dColorSpace   the D3D color space.
//! \param[out] pNvcvColorSpace a place to store the resultant NvCVImage color space.
//! \return NVCV_SUCCESS if successful.
//! \return NVCV_ERR_PIXELFORMAT if there is no equivalent color space.
//! \note This is an experimental API. If you find it useful, please respond to XXX@YYY.com, otherwise we may drop support.
/* EXPERIMENTAL */ NvCV_Status NvCV_API NvCVImage_FromD3DColorSpace(DXGI_COLOR_SPACE_TYPE d3dColorSpace, unsigned char *pNvcvColorSpace);

#endif // __dxgicommon_h__


#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus

#endif // __NVTRANSFER_D3D_H__
|
||||
|
@ -0,0 +1,45 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright(c) 2021 NVIDIA CORPORATION.All Rights Reserved.
|
||||
#
|
||||
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
||||
# and proprietary rights in and to this software, related documentation
|
||||
# and any modifications thereto.Any use, reproduction, disclosure or
|
||||
# distribution of this software and related documentation without an express
|
||||
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
||||
#
|
||||
###############################################################################*/
|
||||
|
||||
#ifndef __NVTRANSFER_D3D11_H__
#define __NVTRANSFER_D3D11_H__

#include <d3d11.h>
#include "nvCVImage.h"
#include "nvTransferD3D.h" // for NvCVImage_ToD3DFormat() and NvCVImage_FromD3DFormat()

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus


//! Initialize an NvCVImage from a D3D11 texture.
//! The pixelFormat and component types will be transferred over, and a cudaGraphicsResource will be registered;
//! the NvCVImage destructor will unregister the resource.
//! It is necessary to call NvCVImage_MapResource() after rendering D3D and before calling NvCVImage_Transfer(),
//! and to call NvCVImage_UnmapResource() before rendering in D3D again.
//! \param[in,out] im the image to be initialized.
//! \param[in]     tx the texture to be used for initialization.
//! \return NVCV_SUCCESS if successful.
//! \note This is an experimental API. If you find it useful, please respond to XXX@YYY.com,
//! otherwise we may drop support.
/* EXPERIMENTAL */ NvCV_Status NvCV_API NvCVImage_InitFromD3D11Texture(NvCVImage *im, struct ID3D11Texture2D *tx);


#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus

#endif // __NVTRANSFER_D3D11_H__
|
||||
|
@ -0,0 +1,261 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright (c) 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
|
||||
#ifndef __NVVIDEO_EFFECTS_H__
#define __NVVIDEO_EFFECTS_H__

#include "nvCVImage.h"

//! NvVFX_API decorates every NvVFX entry point: on Windows it expands to
//! __declspec(dllexport) __cdecl when building the DLL (NVVFX_API_EXPORT defined)
//! and to nothing when consuming it; on Linux it is currently empty.
#ifndef NvVFX_API
#ifdef _WIN32
#ifdef NVVFX_API_EXPORT
#define NvVFX_API __declspec(dllexport) __cdecl
#else
#define NvVFX_API
#endif
#else //if linux
#define NvVFX_API // TODO: Linux code goes here
#endif // _WIN32 or linux
#endif //NvVFX_API

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus

// Forward declaration for CUDA API
typedef struct CUstream_st* CUstream;

//! We use strings as effect selectors.
typedef const char* NvVFX_EffectSelector;

//! We use strings as parameter selectors.
typedef const char* NvVFX_ParameterSelector;

//! Each effect instantiation is associated with an opaque handle.
struct NvVFX_Object;
typedef struct NvVFX_Object NvVFX_Object, *NvVFX_Handle;

//! Effects may use this handle to manage state objects.
struct NvVFX_StateObjectHandleBase;
typedef struct NvVFX_StateObjectHandleBase* NvVFX_StateObjectHandle;

//! Get the SDK version
//! \param[in,out] version  Pointer to an unsigned int set to
//!                         (major << 24) | (minor << 16) | (build << 8) | 0
//! \return NVCV_SUCCESS       if the version was set
//! \return NVCV_ERR_PARAMETER if version was NULL
NvCV_Status NvVFX_API NvVFX_GetVersion(unsigned int *version);

//! Create a new instantiation of a video effect.
//! \param[in]  code   the selector code for the desired video effect.
//! \param[out] effect a handle to the Video Effect instantiation.
//! \return NVCV_SUCCESS if the operation was successful.
NvCV_Status NvVFX_API NvVFX_CreateEffect(NvVFX_EffectSelector code, NvVFX_Handle *effect);


//! Delete a previously allocated video effect.
//! \param[in] effect a handle to the video effect to be deleted.
void NvVFX_API NvVFX_DestroyEffect(NvVFX_Handle effect);


//! Set the value of the selected parameter (unsigned int, int, float, double, unsigned long long, void*, CUstream).
//! \param[in,out] effect    The effect to configure.
//! \param[in]     paramName The selector of the effect parameter to configure.
//! \param[in]     val       The value to be assigned to the selected effect parameter.
//! \return NVCV_SUCCESS       if the operation was successful.
//! \return NVCV_ERR_EFFECT    if an invalid effect handle was supplied.
//! \return NVCV_ERR_SELECTOR  if the chosen effect does not understand the specified selector and data type.
//! \return NVCV_ERR_PARAMETER if the value was out of range.
NvCV_Status NvVFX_API NvVFX_SetU32(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, unsigned int val);
NvCV_Status NvVFX_API NvVFX_SetS32(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, int val);
NvCV_Status NvVFX_API NvVFX_SetF32(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, float val);
NvCV_Status NvVFX_API NvVFX_SetF64(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, double val);
NvCV_Status NvVFX_API NvVFX_SetU64(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, unsigned long long val);
NvCV_Status NvVFX_API NvVFX_SetObject(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, void *ptr);
NvCV_Status NvVFX_API NvVFX_SetStateObjectHandleArray(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, NvVFX_StateObjectHandle* handle);
NvCV_Status NvVFX_API NvVFX_SetCudaStream(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, CUstream stream);

//! Set the selected image descriptor.
//! A shallow copy of the descriptor is made (preserving the pixel pointer), so that an ephemeral NvVFXImage_Init()
//! wrapper may be used in the call to NvVFX_SetImage() if desired, without having to preserve it for the lifetime
//! of the effect. The effect does not take ownership of the pixel buffer.
//! \param[in,out] effect    The effect to configure.
//! \param[in]     paramName The selector of the effect image to configure.
//! \param[in]     im        Pointer to the image descriptor to be used for the selected effect image.
//!                          NULL clears the selected internal image descriptor.
//! \return NVCV_SUCCESS       if the operation was successful.
//! \return NVCV_ERR_EFFECT    if an invalid effect handle was supplied.
//! \return NVCV_ERR_SELECTOR  if the chosen effect does not understand the specified image selector.
//! \return NVCV_ERR_PARAMETER if an unexpected NULL pointer was supplied.
NvCV_Status NvVFX_API NvVFX_SetImage(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, NvCVImage *im);

//! Set the value of the selected string, by making a copy in the effect handle.
//! \param[in,out] effect    The effect to configure.
//! \param[in]     paramName The selector of the effect string to configure.
//! \param[in]     str       The value to be assigned to the selected effect string. NULL clears the selected string.
//! \return NVCV_SUCCESS       if the operation was successful.
//! \return NVCV_ERR_EFFECT    if an invalid effect handle was supplied.
//! \return NVCV_ERR_SELECTOR  if the chosen effect does not understand the specified string selector.
//! \return NVCV_ERR_PARAMETER if an unexpected NULL pointer was supplied.
NvCV_Status NvVFX_API NvVFX_SetString(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, const char *str);


//! Get the value of the selected parameter (unsigned int, int, float, double, unsigned long long, void*, CUstream).
//! These are not typically used except for testing.
//! \param[in]  effect    the effect to be queried.
//! \param[in]  paramName the selector of the effect parameter to retrieve.
//! \param[out] val       a place to store the retrieved parameter.
//! \return NVCV_SUCCESS       if the operation was successful.
//! \return NVCV_ERR_EFFECT    if an invalid effect handle was supplied.
//! \return NVCV_ERR_SELECTOR  if the chosen effect does not understand the specified selector and data type.
//! \return NVCV_ERR_PARAMETER if an unexpected NULL pointer was supplied.
//! \note Typically, these are not used outside of testing.
NvCV_Status NvVFX_API NvVFX_GetU32(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, unsigned int *val);
NvCV_Status NvVFX_API NvVFX_GetS32(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, int *val);
NvCV_Status NvVFX_API NvVFX_GetF32(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, float *val);
NvCV_Status NvVFX_API NvVFX_GetF64(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, double *val);
NvCV_Status NvVFX_API NvVFX_GetU64(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, unsigned long long *val);
NvCV_Status NvVFX_API NvVFX_GetObject(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, void **ptr);
NvCV_Status NvVFX_API NvVFX_GetCudaStream(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, CUstream *stream);

//! Get a copy of the selected image descriptor.
//! If GetImage() is called before SetImage(), the returned descriptor will be filled with zeros.
//! Otherwise, the values will be identical to that in the previous SetImage() call,
//! with the exception of deletePtr, deleteProc and bufferBytes, which will be 0.
//! \param[in]  effect    the effect to be queried.
//! \param[in]  paramName the selector of the effect image to retrieve.
//! \param[out] im        a place to store the selected image descriptor.
//!                       A pointer to an empty NvCVImage (deletePtr==NULL) should be supplied to avoid memory leaks.
//! \return NVCV_SUCCESS       if the operation was successful.
//! \return NVCV_ERR_EFFECT    if an invalid effect handle was supplied.
//! \return NVCV_ERR_SELECTOR  if the chosen effect does not understand the specified image selector.
//! \return NVCV_ERR_PARAMETER if an unexpected NULL pointer was supplied.
//! \note Typically, this is not used outside of testing.
NvCV_Status NvVFX_API NvVFX_GetImage(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, NvCVImage *im);

//! Get the specified string.
//! If GetString() is called before SetString(), the returned string will be empty.
//! Otherwise, the string will be identical to that in the previous SetString() call,
//! though it will be stored in a different location.
//! \param[in]  effect    the effect to be queried.
//! \param[in]  paramName the selector of the effect string to retrieve.
//! \param[out] str       a place to store a pointer to the selected string.
//! \return NVCV_SUCCESS       if the operation was successful.
//! \return NVCV_ERR_EFFECT    if an invalid effect handle was supplied.
//! \return NVCV_ERR_SELECTOR  if the chosen effect does not understand the specified string selector.
//! \return NVCV_ERR_PARAMETER if an unexpected NULL pointer was supplied.
//! \note Typically, this is not used outside of testing.
NvCV_Status NvVFX_API NvVFX_GetString(NvVFX_Handle effect, NvVFX_ParameterSelector paramName, const char **str);

//! Run the selected effect.
//! \param[in] effect the effect object handle.
//! \param[in] async  run the effect asynchronously if nonzero; otherwise run synchronously.
//! \todo Should async instead be a pointer to a place to store a token that can be useful
//!       for synchronizing two streams later?
//! \return NVCV_SUCCESS    if the operation was successful.
//! \return NVCV_ERR_EFFECT if an invalid effect handle was supplied.
NvCV_Status NvVFX_API NvVFX_Run(NvVFX_Handle effect, int async);

//! Load the model based on the set params.
//! \param[in] effect the effect object handle.
//! \return NVCV_SUCCESS    if the operation was successful.
//! \return NVCV_ERR_EFFECT if an invalid effect handle was supplied.
NvCV_Status NvVFX_API NvVFX_Load(NvVFX_Handle effect);

//! Wrapper for cudaStreamCreate(), if it is desired to avoid linking with the cuda lib.
//! \param[out] stream A place to store the newly allocated stream.
//! \return NVCV_SUCCESS        if the operation was successful,
//!         NVCV_ERR_CUDA_VALUE if not.
NvCV_Status NvVFX_API NvVFX_CudaStreamCreate(CUstream *stream);

//! Wrapper for cudaStreamDestroy(), if it is desired to avoid linking with the cuda lib.
//! \param[in] stream The stream to destroy.
//! \return NVCV_SUCCESS        if the operation was successful,
//!         NVCV_ERR_CUDA_VALUE if not.
NvCV_Status NvVFX_API NvVFX_CudaStreamDestroy(CUstream stream);

//! Allocate the state object handle for a feature.
//! \param[in]  effect the effect object handle.
//! \param[out] handle handle to the state object
//! \return NVCV_SUCCESS    if the operation was successful.
//! \return NVCV_ERR_EFFECT if an invalid effect handle was supplied.
//! \note This may depend on prior settings of parameters.
NvCV_Status NvVFX_API NvVFX_AllocateState(NvVFX_Handle effect, NvVFX_StateObjectHandle* handle);

//! Deallocate the state object handle for stateful feature.
//! \param[in] effect the effect object handle.
//! \param[in] handle handle to the state object
//! \return NVCV_SUCCESS    if the operation was successful.
//! \return NVCV_ERR_EFFECT if an invalid effect handle was supplied.
NvCV_Status NvVFX_API NvVFX_DeallocateState(NvVFX_Handle effect, NvVFX_StateObjectHandle handle);

//! Reset the state object handle for stateful feature.
//! \param[in] effect the effect object handle.
//! \param[in] handle handle to the state object
//! \return NVCV_SUCCESS    if the operation was successful.
//! \return NVCV_ERR_EFFECT if an invalid effect handle was supplied.
NvCV_Status NvVFX_API NvVFX_ResetState(NvVFX_Handle effect, NvVFX_StateObjectHandle handle);


// Filter selectors
#define NVVFX_FX_TRANSFER           "Transfer"
#define NVVFX_FX_GREEN_SCREEN       "GreenScreen"       // Green Screen
#define NVVFX_FX_BGBLUR             "BackgroundBlur"    // Background blur
#define NVVFX_FX_ARTIFACT_REDUCTION "ArtifactReduction" // Artifact Reduction
#define NVVFX_FX_SUPER_RES          "SuperRes"          // Super Res
#define NVVFX_FX_SR_UPSCALE         "Upscale"           // Super Res Upscale
#define NVVFX_FX_DENOISING          "Denoising"         // Denoising

// Parameter selectors
#define NVVFX_INPUT_IMAGE_0      "SrcImage0"        //!< There may be multiple input images
#define NVVFX_INPUT_IMAGE        NVVFX_INPUT_IMAGE_0  //!< but there is usually only one input image
#define NVVFX_INPUT_IMAGE_1      "SrcImage1"        //!< Source Image 1
#define NVVFX_OUTPUT_IMAGE_0     "DstImage0"        //!< There may be multiple output images
#define NVVFX_OUTPUT_IMAGE       NVVFX_OUTPUT_IMAGE_0 //!< but there is usually only one output image
#define NVVFX_MODEL_DIRECTORY    "ModelDir"         //!< The directory where the model may be found
#define NVVFX_CUDA_STREAM        "CudaStream"       //!< The CUDA stream to use
#define NVVFX_CUDA_GRAPH         "CudaGraph"        //!< Enable CUDA graph to use
#define NVVFX_INFO               "Info"             //!< Get info about the effects
#define NVVFX_MAX_INPUT_WIDTH    "MaxInputWidth"    //!< Maximum width of the input supported
#define NVVFX_MAX_INPUT_HEIGHT   "MaxInputHeight"   //!< Maximum height of the input supported
#define NVVFX_MAX_NUMBER_STREAMS "MaxNumberStreams" //!< Maximum number of concurrent input streams
#define NVVFX_SCALE              "Scale"            //!< Scale factor
#define NVVFX_STRENGTH           "Strength"         //!< Strength for different filters
#define NVVFX_STRENGTH_LEVELS    "StrengthLevels"   //!< Number of strength levels
#define NVVFX_MODE               "Mode"             //!< Mode for different filters
#define NVVFX_TEMPORAL           "Temporal"         //!< Temporal mode: 0=image, 1=video
#define NVVFX_GPU                "GPU"              //!< Preferred GPU (optional)
#define NVVFX_BATCH_SIZE         "BatchSize"        //!< Batch Size (default 1)
#define NVVFX_MODEL_BATCH        "ModelBatch"       //!< The preferred batching model to use (default 1)
#define NVVFX_STATE              "State"            //!< State variable
#define NVVFX_STATE_SIZE         "StateSize"        //!< Number of bytes needed to store state
#define NVVFX_STATE_COUNT        "NumStateObjects"  //!< Number of active state object handles


#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus

#endif // __NVVIDEO_EFFECTS_H__
|
@ -0,0 +1,307 @@
|
||||
#if defined(linux) || defined(unix) || defined(__linux)
|
||||
#warning nvVideoEffectsProxy.cpp not ported
|
||||
#else // _WIN32_
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright (c) 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
#include <string>
|
||||
|
||||
#include "nvVideoEffects.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#define _WINSOCKAPI_
|
||||
#include <windows.h>
|
||||
#include <tchar.h>
|
||||
#else // !_WIN32
|
||||
#include <dlfcn.h>
|
||||
typedef void* HMODULE;
|
||||
typedef void* HANDLE;
|
||||
typedef void* HINSTANCE;
|
||||
#endif // _WIN32
|
||||
|
||||
// Parameter string does not include the file extension
|
||||
#ifdef _WIN32
|
||||
#define nvLoadLibrary(library) LoadLibrary(TEXT(library ".dll"))
|
||||
#else // !_WIN32
|
||||
#define nvLoadLibrary(library) dlopen("lib" library ".so", RTLD_LAZY)
|
||||
#endif // _WIN32
|
||||
|
||||
|
||||
// Look up a symbol in a loaded dynamic library.
// Uses GetProcAddress on Windows and dlsym elsewhere; a null library handle
// (or a missing symbol) yields nullptr.
inline void* nvGetProcAddress(HINSTANCE handle, const char* proc) {
  if (!handle) {
    return nullptr;
  }
#ifdef _WIN32
  return GetProcAddress(handle, proc);
#else  // !_WIN32
  return dlsym(handle, proc);
#endif  // _WIN32
}
|
||||
|
||||
// Unload a dynamic library previously opened with nvLoadLibrary().
// Returns the platform's native result code (FreeLibrary on Windows,
// dlclose elsewhere).
inline int nvFreeLibrary(HINSTANCE lib) {
#ifdef _WIN32
  return FreeLibrary(lib);
#else  // !_WIN32
  return dlclose(lib);
#endif  // _WIN32
}
|
||||
|
||||
// Locate and load the NVVideoEffects dynamic library, caching the handle.
// Search priority: (1) an app-supplied g_nvVFXSDKPath, (2) the Program Files
// SDK install unless NV_VIDEO_EFFECTS_PATH is set to "USE_APP_PATH", in which
// case the default DLL search order (app directory) is used.
HINSTANCE getNvVfxLib() {

  TCHAR path[MAX_PATH], fullPath[MAX_PATH];
  bool bSDKPathSet = false;

  // Highest priority: the app set an explicit SDK path.
  extern char* g_nvVFXSDKPath;
  if (g_nvVFXSDKPath && g_nvVFXSDKPath[0]) {
#ifndef UNICODE
    strncpy_s(fullPath, MAX_PATH, g_nvVFXSDKPath, MAX_PATH);
#else // UNICODE
    size_t res = 0;
    mbstowcs_s(&res, fullPath, MAX_PATH, g_nvVFXSDKPath, MAX_PATH);
#endif // UNICODE
    SetDllDirectory(fullPath);
    bSDKPathSet = true;
  }

  if (!bSDKPathSet) {
    // There can be multiple apps on the system,
    // some might include the SDK in the app package and
    // others might expect the SDK to be installed in Program Files
    //
    // BUG FIX: GetEnvironmentVariable() leaves the buffer untouched when the
    // variable is not set, so `path` must be a valid (empty) string before the
    // comparison below; previously it was read uninitialized in that case.
    path[0] = TEXT('\0');
    GetEnvironmentVariable(TEXT("NV_VIDEO_EFFECTS_PATH"), path, MAX_PATH);
    if (_tcscmp(path, TEXT("USE_APP_PATH"))) {
      // App has not set environment variable to "USE_APP_PATH"
      // So pick up the SDK dll and dependencies from Program Files
      GetEnvironmentVariable(TEXT("ProgramFiles"), path, MAX_PATH);
      size_t max_len = sizeof(fullPath) / sizeof(TCHAR);
      _stprintf_s(fullPath, max_len, TEXT("%s\\NVIDIA Corporation\\NVIDIA Video Effects\\"), path);
      SetDllDirectory(fullPath);
    }
  }

  // Load once; the static local caches the handle for all subsequent calls.
  static const HINSTANCE NvVfxLib = nvLoadLibrary("NVVideoEffects");
  return NvVfxLib;
}
|
||||
|
||||
// Proxy for NvVFX_GetVersion: resolves the real entry point from the
// NVVideoEffects library once (cached in a static) and forwards the call.
NvCV_Status NvVFX_API NvVFX_GetVersion(unsigned int* version) {
  static const auto funcPtr = (decltype(NvVFX_GetVersion)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_GetVersion");
  return funcPtr ? funcPtr(version) : NVCV_ERR_LIBRARY;
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_CreateEffect(NvVFX_EffectSelector code, NvVFX_Handle* obj) {
|
||||
static const auto funcPtr = (decltype(NvVFX_CreateEffect)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_CreateEffect");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(code, obj);
|
||||
}
|
||||
|
||||
void NvVFX_API NvVFX_DestroyEffect(NvVFX_Handle obj) {
|
||||
static const auto funcPtr = (decltype(NvVFX_DestroyEffect)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_DestroyEffect");
|
||||
|
||||
if (nullptr != funcPtr) funcPtr(obj);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_SetU32(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, unsigned int val) {
|
||||
static const auto funcPtr = (decltype(NvVFX_SetU32)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_SetU32");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, val);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_SetS32(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, int val) {
|
||||
static const auto funcPtr = (decltype(NvVFX_SetS32)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_SetS32");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, val);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_SetF32(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, float val) {
|
||||
static const auto funcPtr = (decltype(NvVFX_SetF32)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_SetF32");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, val);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_SetF64(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, double val) {
|
||||
static const auto funcPtr = (decltype(NvVFX_SetF64)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_SetF64");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, val);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_SetU64(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, unsigned long long val) {
|
||||
static const auto funcPtr = (decltype(NvVFX_SetU64)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_SetU64");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, val);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_SetImage(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, NvCVImage* im) {
|
||||
static const auto funcPtr = (decltype(NvVFX_SetImage)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_SetImage");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, im);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_SetObject(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, void* ptr) {
|
||||
static const auto funcPtr = (decltype(NvVFX_SetObject)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_SetObject");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, ptr);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_SetStateObjectHandleArray(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, NvVFX_StateObjectHandle* handle) {
|
||||
static const auto funcPtr = (decltype(NvVFX_SetStateObjectHandleArray)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_SetStateObjectHandleArray");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, handle);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_SetString(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, const char* str) {
|
||||
static const auto funcPtr = (decltype(NvVFX_SetString)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_SetString");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, str);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_SetCudaStream(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, CUstream stream) {
|
||||
static const auto funcPtr = (decltype(NvVFX_SetCudaStream)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_SetCudaStream");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, stream);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_GetU32(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, unsigned int* val) {
|
||||
static const auto funcPtr = (decltype(NvVFX_GetU32)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_GetU32");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, val);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_GetS32(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, int* val) {
|
||||
static const auto funcPtr = (decltype(NvVFX_GetS32)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_GetS32");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, val);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_GetF32(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, float* val) {
|
||||
static const auto funcPtr = (decltype(NvVFX_GetF32)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_GetF32");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, val);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_GetF64(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, double* val) {
|
||||
static const auto funcPtr = (decltype(NvVFX_GetF64)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_GetF64");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, val);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_GetU64(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, unsigned long long* val) {
|
||||
static const auto funcPtr = (decltype(NvVFX_GetU64)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_GetU64");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, val);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_GetImage(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, NvCVImage* im) {
|
||||
static const auto funcPtr = (decltype(NvVFX_GetImage)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_GetImage");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, im);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_GetObject(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, void** ptr) {
|
||||
static const auto funcPtr = (decltype(NvVFX_GetObject)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_GetObject");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, ptr);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_GetString(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, const char** str) {
|
||||
static const auto funcPtr = (decltype(NvVFX_GetString)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_GetString");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, str);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_GetCudaStream(NvVFX_Handle obj, NvVFX_ParameterSelector paramName, CUstream* stream) {
|
||||
static const auto funcPtr = (decltype(NvVFX_GetCudaStream)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_GetCudaStream");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, paramName, stream);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_Run(NvVFX_Handle obj, int async) {
|
||||
static const auto funcPtr = (decltype(NvVFX_Run)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_Run");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, async);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_Load(NvVFX_Handle obj) {
|
||||
static const auto funcPtr = (decltype(NvVFX_Load)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_Load");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_CudaStreamCreate(CUstream* stream) {
|
||||
static const auto funcPtr =
|
||||
(decltype(NvVFX_CudaStreamCreate)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_CudaStreamCreate");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(stream);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_CudaStreamDestroy(CUstream stream) {
|
||||
static const auto funcPtr =
|
||||
(decltype(NvVFX_CudaStreamDestroy)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_CudaStreamDestroy");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(stream);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_AllocateState(NvVFX_Handle obj, NvVFX_StateObjectHandle* handle) {
|
||||
static const auto funcPtr = (decltype(NvVFX_AllocateState)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_AllocateState");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, handle);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_DeallocateState(NvVFX_Handle obj, NvVFX_StateObjectHandle handle) {
|
||||
static const auto funcPtr = (decltype(NvVFX_DeallocateState)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_DeallocateState");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, handle);
|
||||
}
|
||||
|
||||
NvCV_Status NvVFX_API NvVFX_ResetState(NvVFX_Handle obj, NvVFX_StateObjectHandle handle) {
|
||||
static const auto funcPtr = (decltype(NvVFX_ResetState)*)nvGetProcAddress(getNvVfxLib(), "NvVFX_ResetState");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(obj, handle);
|
||||
}
|
||||
|
||||
#endif // enabling for this file
|
@ -0,0 +1,341 @@
|
||||
#if defined(linux) || defined(unix) || defined(__linux)
|
||||
#warning nvCVImageProxy.cpp not ported
|
||||
#else
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
#include <string>
|
||||
#include "nvCVImage.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#define _WINSOCKAPI_
|
||||
#include <windows.h>
|
||||
#include <tchar.h>
|
||||
#include "nvTransferD3D.h"
|
||||
#include "nvTransferD3D11.h"
|
||||
#else // !_WIN32
|
||||
#include <dlfcn.h>
|
||||
typedef void* HMODULE;
|
||||
typedef void* HANDLE;
|
||||
typedef void* HINSTANCE;
|
||||
#endif // _WIN32
|
||||
|
||||
// Parameter string does not include the file extension
|
||||
#ifdef _WIN32
|
||||
#define nvLoadLibrary(library) LoadLibrary(TEXT(library ".dll"))
|
||||
#else // !_WIN32
|
||||
#define nvLoadLibrary(library) dlopen("lib" library ".so", RTLD_LAZY)
|
||||
#endif // _WIN32
|
||||
|
||||
|
||||
// Resolve symbol `proc` from a previously loaded module.
// Returns nullptr when the module handle is null or the symbol is absent,
// so callers can treat "library missing" and "symbol missing" uniformly.
inline void* nvGetProcAddress(HINSTANCE handle, const char* proc) {
  if (handle == nullptr) return nullptr;
#ifdef _WIN32
  return GetProcAddress(handle, proc);
#else   // POSIX
  return dlsym(handle, proc);
#endif  // _WIN32
}
|
||||
|
||||
// Unload a module previously obtained via nvLoadLibrary.
// Forwards the platform API's return value unchanged (FreeLibrary: nonzero on
// success; dlclose: 0 on success) — callers must interpret per platform.
inline int nvFreeLibrary(HINSTANCE handle) {
#ifdef _WIN32
  return FreeLibrary(handle);
#else   // POSIX
  return dlclose(handle);
#endif  // _WIN32
}
|
||||
|
||||
// Locate and load the NVCVImage library, caching the handle across calls.
//
// Search order:
//   1. Default loader search path (app dir / PATH).
//   2. Unless an env var says "USE_APP_PATH": Program Files install of the
//      Video Effects SDK, then of the AR SDK.
// The lookup is attempted once; the (possibly null) handle is cached.
HINSTANCE getNvCVImageLib() {
  TCHAR path[MAX_PATH], tmpPath[MAX_PATH], fullPath[MAX_PATH];
  static HINSTANCE nvCVImageLib = NULL;
  static bool bSDKPathSet = false;
  if (!bSDKPathSet) {
    nvCVImageLib = nvLoadLibrary("NVCVImage");
    if (nvCVImageLib) bSDKPathSet = true;
  }
  if (!bSDKPathSet) {
    // There can be multiple apps on the system,
    // some might include the SDK in the app package and
    // others might expect the SDK to be installed in Program Files.
    // Bug fix: the return values of GetEnvironmentVariable were ignored, so
    // when a variable was unset, the buffer was compared while uninitialized.
    if (0 == GetEnvironmentVariable(TEXT("NV_VIDEO_EFFECTS_PATH"), path, MAX_PATH)) {
      path[0] = TEXT('\0');
    }
    if (0 == GetEnvironmentVariable(TEXT("NV_AR_SDK_PATH"), tmpPath, MAX_PATH)) {
      tmpPath[0] = TEXT('\0');
    }
    if (_tcscmp(path, TEXT("USE_APP_PATH")) && _tcscmp(tmpPath, TEXT("USE_APP_PATH"))) {
      // Neither env var requests the app path, so try the Program Files
      // install of the Video Effects SDK first, then the AR SDK.
      GetEnvironmentVariable(TEXT("ProgramFiles"), path, MAX_PATH);
      size_t max_len = sizeof(fullPath) / sizeof(TCHAR);
      _stprintf_s(fullPath, max_len, TEXT("%s\\NVIDIA Corporation\\NVIDIA Video Effects\\"), path);
      SetDllDirectory(fullPath);
      nvCVImageLib = nvLoadLibrary("NVCVImage");
      if (!nvCVImageLib) {
        _stprintf_s(fullPath, max_len, TEXT("%s\\NVIDIA Corporation\\NVIDIA AR SDK\\"), path);
        SetDllDirectory(fullPath);
        nvCVImageLib = nvLoadLibrary("NVCVImage");
      }
    }
    bSDKPathSet = true;  // do not retry on later calls, even on failure
  }
  return nvCVImageLib;
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_Init(NvCVImage* im, unsigned width, unsigned height, int pitch, void* pixels,
|
||||
NvCVImage_PixelFormat format, NvCVImage_ComponentType type, unsigned isPlanar,
|
||||
unsigned onGPU) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_Init)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_Init");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(im, width, height, pitch, pixels, format, type, isPlanar, onGPU);
|
||||
}
|
||||
|
||||
void NvCV_API NvCVImage_InitView(NvCVImage* subImg, NvCVImage* fullImg, int x, int y, unsigned width,
|
||||
unsigned height) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_InitView)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_InitView");
|
||||
|
||||
if (nullptr != funcPtr) funcPtr(subImg, fullImg, x, y, width, height);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_Alloc(NvCVImage* im, unsigned width, unsigned height, NvCVImage_PixelFormat format,
|
||||
NvCVImage_ComponentType type, unsigned isPlanar, unsigned onGPU, unsigned alignment) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_Alloc)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_Alloc");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(im, width, height, format, type, isPlanar, onGPU, alignment);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_Realloc(NvCVImage* im, unsigned width, unsigned height,
|
||||
NvCVImage_PixelFormat format, NvCVImage_ComponentType type,
|
||||
unsigned isPlanar, unsigned onGPU, unsigned alignment) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_Realloc)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_Realloc");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(im, width, height, format, type, isPlanar, onGPU, alignment);
|
||||
}
|
||||
|
||||
void NvCV_API NvCVImage_Dealloc(NvCVImage* im) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_Dealloc)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_Dealloc");
|
||||
|
||||
if (nullptr != funcPtr) funcPtr(im);
|
||||
}
|
||||
|
||||
void NvCV_API NvCVImage_DeallocAsync(NvCVImage* im, CUstream_st* stream) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_DeallocAsync)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_DeallocAsync");
|
||||
|
||||
if (nullptr != funcPtr) funcPtr(im, stream);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_Create(unsigned width, unsigned height, NvCVImage_PixelFormat format,
|
||||
NvCVImage_ComponentType type, unsigned isPlanar, unsigned onGPU,
|
||||
unsigned alignment, NvCVImage** out) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_Create)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_Create");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(width, height, format, type, isPlanar, onGPU, alignment, out);
|
||||
}
|
||||
|
||||
void NvCV_API NvCVImage_Destroy(NvCVImage* im) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_Destroy)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_Destroy");
|
||||
|
||||
if (nullptr != funcPtr) funcPtr(im);
|
||||
}
|
||||
|
||||
void NvCV_API NvCVImage_ComponentOffsets(NvCVImage_PixelFormat format, int* rOff, int* gOff, int* bOff, int* aOff,
|
||||
int* yOff) {
|
||||
static const auto funcPtr =
|
||||
(decltype(NvCVImage_ComponentOffsets)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_ComponentOffsets");
|
||||
|
||||
if (nullptr != funcPtr) funcPtr(format, rOff, gOff, bOff, aOff, yOff);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_Transfer(const NvCVImage* src, NvCVImage* dst, float scale, CUstream_st* stream,
|
||||
NvCVImage* tmp) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_Transfer)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_Transfer");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(src, dst, scale, stream, tmp);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_TransferRect(const NvCVImage *src, const NvCVRect2i *srcRect, NvCVImage *dst,
|
||||
const NvCVPoint2i *dstPt, float scale, struct CUstream_st *stream, NvCVImage *tmp) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_TransferRect)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_TransferRect");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(src, srcRect, dst, dstPt, scale, stream, tmp);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_TransferFromYUV(const void *y, int yPixBytes, int yPitch, const void *u, const void *v,
|
||||
int uvPixBytes, int uvPitch, NvCVImage_PixelFormat yuvFormat, NvCVImage_ComponentType yuvType, unsigned yuvColorSpace,
|
||||
unsigned yuvMemSpace, NvCVImage *dst, const NvCVRect2i *dstRect, float scale, struct CUstream_st *stream, NvCVImage *tmp) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_TransferFromYUV)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_TransferFromYUV");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(y, yPixBytes, yPitch, u, v, uvPixBytes, uvPitch, yuvFormat, yuvType, yuvColorSpace, yuvMemSpace, dst,
|
||||
dstRect, scale, stream, tmp);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_TransferToYUV(const NvCVImage *src, const NvCVRect2i *srcRect,
|
||||
const void *y, int yPixBytes, int yPitch, const void *u, const void *v, int uvPixBytes, int uvPitch,
|
||||
NvCVImage_PixelFormat yuvFormat, NvCVImage_ComponentType yuvType, unsigned yuvColorSpace, unsigned yuvMemSpace,
|
||||
float scale, struct CUstream_st *stream, NvCVImage *tmp) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_TransferToYUV)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_TransferToYUV");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(src, srcRect, y, yPixBytes, yPitch, u, v, uvPixBytes, uvPitch, yuvFormat, yuvType, yuvColorSpace, yuvMemSpace, scale, stream, tmp);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_MapResource(NvCVImage *im, struct CUstream_st *stream) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_MapResource)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_MapResource");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(im, stream);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_UnmapResource(NvCVImage *im, struct CUstream_st *stream) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_UnmapResource)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_UnmapResource");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(im, stream);
|
||||
}
|
||||
|
||||
#if RTX_CAMERA_IMAGE == 0
|
||||
NvCV_Status NvCV_API NvCVImage_Composite(const NvCVImage* fg, const NvCVImage* bg, const NvCVImage* mat, NvCVImage* dst,
|
||||
struct CUstream_st *stream) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_Composite)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_Composite");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(fg, bg, mat, dst, stream);
|
||||
}
|
||||
#else // RTX_CAMERA_IMAGE == 1
|
||||
NvCV_Status NvCV_API NvCVImage_Composite(const NvCVImage* fg, const NvCVImage* bg, const NvCVImage* mat, NvCVImage* dst) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_Composite)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_Composite");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(fg, bg, mat, dst);
|
||||
}
|
||||
#endif // RTX_CAMERA_IMAGE
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_CompositeRect(
|
||||
const NvCVImage *fg, const NvCVPoint2i *fgOrg,
|
||||
const NvCVImage *bg, const NvCVPoint2i *bgOrg,
|
||||
const NvCVImage *mat, unsigned mode,
|
||||
NvCVImage *dst, const NvCVPoint2i *dstOrg,
|
||||
struct CUstream_st *stream) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_CompositeRect)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_CompositeRect");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(fg, fgOrg, bg, bgOrg, mat, mode, dst, dstOrg, stream);
|
||||
}
|
||||
|
||||
#if RTX_CAMERA_IMAGE == 0
|
||||
NvCV_Status NvCV_API NvCVImage_CompositeOverConstant(const NvCVImage *src, const NvCVImage *mat,
|
||||
const void *bgColor, NvCVImage *dst, struct CUstream_st *stream) {
|
||||
static const auto funcPtr =
|
||||
(decltype(NvCVImage_CompositeOverConstant)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_CompositeOverConstant");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(src, mat, bgColor, dst, stream);
|
||||
}
|
||||
#else // RTX_CAMERA_IMAGE == 1
|
||||
NvCV_Status NvCV_API NvCVImage_CompositeOverConstant(const NvCVImage *src, const NvCVImage *mat,
|
||||
const unsigned char bgColor[3], NvCVImage *dst) {
|
||||
static const auto funcPtr =
|
||||
(decltype(NvCVImage_CompositeOverConstant)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_CompositeOverConstant");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(src, mat, bgColor, dst);
|
||||
}
|
||||
#endif // RTX_CAMERA_IMAGE
|
||||
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_FlipY(const NvCVImage *src, NvCVImage *dst) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_FlipY)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_FlipY");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(src, dst);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_Sharpen(float sharpness, const NvCVImage *src, NvCVImage *dst,
|
||||
struct CUstream_st *stream, NvCVImage *tmp) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_Sharpen)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_Sharpen");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(sharpness, src, dst, stream, tmp);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
__declspec(dllexport) const char* __cdecl
|
||||
#else
|
||||
const char*
|
||||
#endif // _WIN32 or linux
|
||||
NvCV_GetErrorStringFromCode(NvCV_Status code) {
|
||||
static const auto funcPtr =
|
||||
(decltype(NvCV_GetErrorStringFromCode)*)nvGetProcAddress(getNvCVImageLib(), "NvCV_GetErrorStringFromCode");
|
||||
|
||||
if (nullptr == funcPtr) return "Cannot find nvCVImage DLL or its dependencies";
|
||||
return funcPtr(code);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifdef _WIN32 // Direct 3D
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_InitFromD3D11Texture(NvCVImage *im, struct ID3D11Texture2D *tx) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_InitFromD3D11Texture)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_InitFromD3D11Texture");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(im, tx);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_ToD3DFormat(NvCVImage_PixelFormat format, NvCVImage_ComponentType type, unsigned layout, DXGI_FORMAT *d3dFormat) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_ToD3DFormat)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_ToD3DFormat");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(format, type, layout, d3dFormat);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_FromD3DFormat(DXGI_FORMAT d3dFormat, NvCVImage_PixelFormat *format, NvCVImage_ComponentType *type, unsigned char *layout) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_FromD3DFormat)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_FromD3DFormat");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(d3dFormat, format, type, layout);
|
||||
}
|
||||
|
||||
#ifdef __dxgicommon_h__
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_ToD3DColorSpace(unsigned char nvcvColorSpace, DXGI_COLOR_SPACE_TYPE *pD3dColorSpace) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_ToD3DColorSpace)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_ToD3DColorSpace");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(nvcvColorSpace, pD3dColorSpace);
|
||||
}
|
||||
|
||||
NvCV_Status NvCV_API NvCVImage_FromD3DColorSpace(DXGI_COLOR_SPACE_TYPE d3dColorSpace, unsigned char *pNvcvColorSpace) {
|
||||
static const auto funcPtr = (decltype(NvCVImage_FromD3DColorSpace)*)nvGetProcAddress(getNvCVImageLib(), "NvCVImage_FromD3DColorSpace");
|
||||
|
||||
if (nullptr == funcPtr) return NVCV_ERR_LIBRARY;
|
||||
return funcPtr(d3dColorSpace, pNvcvColorSpace);
|
||||
}
|
||||
|
||||
#endif // __dxgicommon_h__
|
||||
|
||||
#endif // _WIN32 Direct 3D
|
||||
|
||||
#endif // enabling for this file
|
After Width: | Height: | Size: 2.2 MiB |
After Width: | Height: | Size: 1.6 MiB |
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -0,0 +1,43 @@
|
||||
# AigsEffectApp sample. The two proxy .cpp files load the SDK libraries at
# runtime, so the app links only cudart (MSVC) or the imported SDK targets.
set(SOURCE_FILES
    AigsEffectApp.cpp
    ../../nvvfx/src/nvVideoEffectsProxy.cpp
    ../../nvvfx/src/nvCVImageProxy.cpp)

# Set Visual Studio source filters
source_group("Source Files" FILES ${SOURCE_FILES})

add_executable(AigsEffectApp ${SOURCE_FILES})

target_include_directories(AigsEffectApp PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}/../utils)
target_include_directories(AigsEffectApp PUBLIC
    ${SDK_INCLUDES_PATH})

if(MSVC)
    # NOTE(review): cudart.lib is referenced by absolute path from the bundled
    # SDK package; an imported target would be preferable if one exists.
    target_link_libraries(AigsEffectApp PUBLIC
        opencv346
        NVVideoEffects
        ${CMAKE_CURRENT_SOURCE_DIR}/../external/cuda/lib/x64/cudart.lib)

    # Debugger convenience: put the bundled OpenCV DLLs on PATH (the list forms
    # "PATH=%PATH%;<opencv bin>") and pass a sample image on the command line.
    set(OPENCV_PATH_STR ${CMAKE_CURRENT_SOURCE_DIR}/../external/opencv/bin)
    set(PATH_STR "PATH=%PATH%" ${OPENCV_PATH_STR})
    set(CMD_ARG_STR "--show --in_file=\"${CMAKE_CURRENT_SOURCE_DIR}/../input/input_003054.jpg\" ")
    set_target_properties(AigsEffectApp PROPERTIES
        FOLDER SampleApps
        VS_DEBUGGER_ENVIRONMENT "${PATH_STR}"
        VS_DEBUGGER_COMMAND_ARGUMENTS "${CMD_ARG_STR}")
else()
    # Linux: link the SDK and its dependencies via targets found at the top level.
    target_link_libraries(AigsEffectApp PUBLIC
        NVVideoEffects
        NVCVImage
        OpenCV
        TensorRT
        CUDA)
endif()
|
@ -0,0 +1,4 @@
|
||||
REM Run the AIGS sample against the default webcam; PATH change stays local.
SETLOCAL
REM Make the bundled OpenCV DLLs visible to the executable.
SET PATH=%PATH%;..\external\opencv\bin;
REM AigsEffectApp.exe --in_file=..\input\input_003054.jpg --show
AigsEffectApp.exe --webcam --show
|
@ -0,0 +1,370 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright (c) 2022 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "BatchUtilities.h"
|
||||
#include "nvCVOpenCV.h"
|
||||
#include "nvVideoEffects.h"
|
||||
#include "opencv2/opencv.hpp"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define strcasecmp _stricmp
|
||||
#endif // _MSC_VER
|
||||
|
||||
#define BAIL_IF_ERR(err) do { if (0 != (err)) { goto bail; } } while(0)
|
||||
#define BAIL_IF_NULL(x, err, code) do { if ((void*)(x) == NULL) { err = code; goto bail; } } while(0)
|
||||
#define BAIL_IF_FALSE(x, err, code) do { if (!(x)) { err = code; goto bail; } } while(0)
|
||||
#define BAIL(err, code) do { err = code; goto bail; } while(0)
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DEFAULT_CODEC "avc1"
|
||||
#else // !_WIN32
|
||||
#define DEFAULT_CODEC "H264"
|
||||
#endif // _WIN32
|
||||
|
||||
bool FLAG_verbose = false;
|
||||
int FLAG_mode = 0;
|
||||
std::string FLAG_outFile,
|
||||
FLAG_modelDir,
|
||||
FLAG_codec = DEFAULT_CODEC;
|
||||
std::vector<const char*> FLAG_inFiles;
|
||||
|
||||
// Set this when using OTA Updates
|
||||
// This path is used by nvVideoEffectsProxy.cpp to load the SDK dll
|
||||
// when using OTA Updates
|
||||
char *g_nvVFXSDKPath = NULL;
|
||||
|
||||
// Match `arg` ("-name", "--name", "-name=value", ...) against `flag`.
// On a match, *val is the text after '=' or NULL when no '=' was given.
// Returns false (leaving *val untouched) when arg is not this flag.
static bool GetFlagArgVal(const char *flag, const char *arg, const char **val) {
  if (arg[0] != '-') return false;
  do {  // skip the leading run of dashes
    ++arg;
  } while (*arg == '-');
  const char *eq = strchr(arg, '=');
  if (eq == NULL) {
    // Bare flag: the whole remainder must equal the flag name.
    if (strcmp(flag, arg) != 0) return false;
    *val = NULL;
    return true;
  }
  // "name=value": compare only the name part, then point *val past '='.
  size_t nameLen = (size_t)(eq - arg);
  if (strlen(flag) != nameLen || strncmp(flag, arg, nameLen) != 0) return false;
  *val = eq + 1;
  return true;
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, std::string *val) {
|
||||
const char *valStr;
|
||||
if (!GetFlagArgVal(flag, arg, &valStr))
|
||||
return false;
|
||||
val->assign(valStr ? valStr : "");
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, bool *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success) {
|
||||
*val = (valStr == NULL ||
|
||||
strcasecmp(valStr, "true") == 0 ||
|
||||
strcasecmp(valStr, "on") == 0 ||
|
||||
strcasecmp(valStr, "yes") == 0 ||
|
||||
strcasecmp(valStr, "1") == 0
|
||||
);
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, long *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtol(valStr, NULL, 10);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, int *val) {
|
||||
long longVal;
|
||||
bool success = GetFlagArgVal(flag, arg, &longVal);
|
||||
if (success)
|
||||
*val = (int)longVal;
|
||||
return success;
|
||||
}
|
||||
|
||||
// Pack up to the first 4 characters of str into a FOURCC integer, with the
// first character in the least-significant byte and short strings zero-padded.
// Fix: the original read an inactive union member (type punning — undefined
// behavior in C++) and was endian-dependent; explicit shifts produce the same
// value portably and match cv::VideoWriter::fourcc().
static int StringToFourcc(const std::string &str) {
  unsigned fourcc = 0;
  for (size_t n = (str.size() < 4) ? str.size() : 4; n--;)
    fourcc |= (unsigned)(unsigned char)str[n] << (8 * n);
  return (int)fourcc;
}
|
||||
|
||||
static void Usage() {
|
||||
printf(
|
||||
"BatchAigsEffectApp [flags ...] inFile1 [ inFileN ...]\n"
|
||||
" where flags is:\n"
|
||||
" --out_file=<path> output video files to be written (a pattern with one %%u or %%d), default \"BatchOut_%%02u.mp4\"\n"
|
||||
" --model_dir=<path> the path to the directory that contains the models\n"
|
||||
" --mode=<value> which model to pick for processing (default: 0)\n"
|
||||
" --verbose verbose output\n"
|
||||
" --codec=<fourcc> the fourcc code for the desired codec (default " DEFAULT_CODEC ")\n"
|
||||
" and inFile1 ... are identically sized video files\n"
|
||||
);
|
||||
}
|
||||
|
||||
// Parse command-line arguments into the FLAG_* globals.
// Returns nonzero when the caller should exit (currently only after --help).
static int ParseMyArgs(int argc, char **argv) {
  int errs = 0;
  for (--argc, ++argv; argc--; ++argv) {  // skip argv[0], walk the rest
    bool help;
    const char *arg = *argv;
    if (arg[0] == '-') {
      if (arg[1] == '-') { // double-dash
        // Each GetFlagArgVal overload consumes the arg iff the name matches.
        if (GetFlagArgVal("verbose", arg, &FLAG_verbose) ||
          GetFlagArgVal("mode", arg, &FLAG_mode) ||
          GetFlagArgVal("model_dir", arg, &FLAG_modelDir) ||
          GetFlagArgVal("out_file", arg, &FLAG_outFile) ||
          GetFlagArgVal("codec", arg, &FLAG_codec)
          ) {
          continue;
        } else if (GetFlagArgVal("help", arg, &help)) { // --help
          Usage();
          errs = 1;
        }
        // NOTE(review): an unrecognized "--" flag falls through silently here
        // (no warning printed, errs unchanged) — confirm this is intended.
      }
      else { // single dash: bundled short flags, e.g. "-v"
        for (++arg; *arg; ++arg) {
          if (*arg == 'v') {
            FLAG_verbose = true;
          } else {
            printf("Unknown flag ignored: \"-%c\"\n", *arg);
          }
        }
        continue;
      }
    }
    else { // no dash: positional argument, an input video file
      FLAG_inFiles.push_back(arg);
    }
  }
  return errs;
}
|
||||
|
||||
|
||||
// Owns one NvVFX effect instance plus the batched GPU source/destination
// buffers and the CUDA stream used to drive it.
class App {
public:
  NvVFX_Handle _eff;           // effect handle; nullptr until init() succeeds
  NvCVImage _src, _stg, _dst;  // batched GPU buffers (_stg declared but unused here)
  CUstream _stream;            // CUDA stream for transfers and inference
  unsigned _batchSize;         // number of images stacked into _src/_dst

  App() : _eff(nullptr), _stream(0), _batchSize(0) {}
  ~App() {
    // NOTE(review): assumes NvVFX_DestroyEffect tolerates a null handle — confirm.
    NvVFX_DestroyEffect(_eff); if (_stream) NvVFX_CudaStreamDestroy(_stream);
  }

  // Create the effect, allocate batched BGR/u8 input and alpha/u8 output
  // buffers sized from srcImg, and set the effect's static parameters.
  // Returns the first failing SDK status, or NVCV_SUCCESS.
  NvCV_Status init(const char* effectName, unsigned batchSize, unsigned int mode, const NvCVImage *srcImg) {
    NvCV_Status err = NVCV_ERR_UNIMPLEMENTED;

    _batchSize = batchSize;
    BAIL_IF_ERR(err = NvVFX_CreateEffect(effectName, &_eff));

    // Batched buffers hold batchSize images; _dst.height / _batchSize below
    // implies the images are stacked vertically in one allocation.
    BAIL_IF_ERR(err = AllocateBatchBuffer(&_src, _batchSize, srcImg->width, srcImg->height, NVCV_BGR, NVCV_U8, NVCV_CHUNKY, NVCV_GPU, 1));
    BAIL_IF_ERR(err = AllocateBatchBuffer(&_dst, _batchSize, srcImg->width, srcImg->height, NVCV_A, NVCV_U8, NVCV_CHUNKY, NVCV_GPU, 1));
    BAIL_IF_ERR(err = NvVFX_SetString(_eff, NVVFX_MODEL_DIRECTORY, FLAG_modelDir.c_str()));

    { // Set parameters.
      NvCVImage nth;
      BAIL_IF_ERR(err = NvVFX_SetImage(_eff, NVVFX_INPUT_IMAGE, NthImage(0, srcImg->height, &_src, &nth))); // Set the first of the batched images in ...
      BAIL_IF_ERR(err = NvVFX_SetImage(_eff, NVVFX_OUTPUT_IMAGE, NthImage(0, _dst.height / _batchSize, &_dst, &nth))); // ... and out
      BAIL_IF_ERR(err = NvVFX_CudaStreamCreate(&_stream));
      BAIL_IF_ERR(err = NvVFX_SetCudaStream(_eff, NVVFX_CUDA_STREAM, _stream));
      BAIL_IF_ERR(err = NvVFX_SetU32(_eff, NVVFX_MODE, mode));
    }

  bail:
    return err;
  }
};
|
||||
|
||||
|
||||
// Run the green-screen effect over several equally-sized videos in lockstep:
// each batch contains one frame from every stream (batchSize == stream count),
// with per-stream SDK state objects so streams do not contaminate each other.
// Writes one grayscale matte video per input, named via outfilePattern.
NvCV_Status BatchProcess(const char* effectName, unsigned int mode,
    const std::vector<const char*>& srcVideos, const char *outfilePattern, std::string codec) {
  NvCV_Status err = NVCV_SUCCESS;
  App app;
  cv::Mat ocv1, ocv2;
  NvCVImage nvx1, nvx2;
  unsigned srcWidth, srcHeight, dstHeight;

  std::vector<NvVFX_StateObjectHandle> arrayOfStates;  // one state per stream
  NvVFX_StateObjectHandle* batchOfStates = nullptr;    // per-run batch view of the states

  unsigned int numOfVideoStreams = static_cast<unsigned int>(srcVideos.size());

  // If valid states are passed for inference, then -
  // 1. Effect can only process a batch which is equal to maximum number of video streams
  // 2. Multiple frames from the same video stream should not be present in the same batch
  unsigned batchSize = numOfVideoStreams;

  std::vector<cv::VideoCapture> srcCaptures(numOfVideoStreams);
  std::vector<cv::VideoWriter> dstWriters(numOfVideoStreams);
  // Open every source video and a matching writer (same size/fps as its source).
  for (unsigned int i = 0; i < numOfVideoStreams; i++) {
    srcCaptures[i].open(srcVideos[i]);
    if (srcCaptures[i].isOpened()==false) BAIL(err, NVCV_ERR_READ);

    int width, height;
    double fps;
    width = (int)srcCaptures[i].get(cv::CAP_PROP_FRAME_WIDTH);
    height = (int)srcCaptures[i].get(cv::CAP_PROP_FRAME_HEIGHT);
    fps = srcCaptures[i].get(cv::CAP_PROP_FPS);

    const int fourcc = StringToFourcc(codec);
    char fileName[1024];
    snprintf(fileName, sizeof(fileName), outfilePattern, i);  // pattern carries one %u/%d
    // isColor=false: the effect's output is a single-channel alpha matte.
    dstWriters[i].open(fileName, fourcc, fps, cv::Size2i(width,height), false);
    if (dstWriters[i].isOpened() == false) BAIL(err, NVCV_ERR_WRITE);
  }

  // Read in the first image, to determine the resolution for init()
  BAIL_IF_FALSE(srcVideos.size() > 0, err, NVCV_ERR_MISSINGINPUT);
  srcCaptures[0] >> ocv1;
  srcCaptures[0].set(cv::CAP_PROP_POS_FRAMES, 0); //resetting to first frame
  if (!ocv1.data) {
    printf("Cannot read video file \"%s\"\n", srcVideos[0]);
    BAIL(err, NVCV_ERR_READ);
  }
  NVWrapperForCVMat(&ocv1, &nvx1);  // wrap the cv::Mat without copying pixels
  srcWidth = nvx1.width;
  srcHeight = nvx1.height;

  BAIL_IF_ERR(err = app.init(effectName, batchSize, mode, &nvx1)); // Init effect and buffers
  BAIL_IF_ERR(err = NvVFX_SetU32(app._eff, NVVFX_MAX_NUMBER_STREAMS, numOfVideoStreams));
  // NOTE(review): model batch is forced to 8 whenever more than one stream is
  // given, regardless of the actual stream count — confirm this is intended.
  BAIL_IF_ERR(err = NvVFX_SetU32(app._eff, NVVFX_MODEL_BATCH, numOfVideoStreams>1?8:1));
  BAIL_IF_ERR(err = NvVFX_Load(app._eff));

  // Creating state objects, one per stream.
  for (unsigned int i = 0; i < numOfVideoStreams; i++) {
    NvVFX_StateObjectHandle state;
    BAIL_IF_ERR(err = NvVFX_AllocateState(app._eff, &state));
    arrayOfStates.push_back(state);
  }

  //Creating batch array to hold states
  batchOfStates = (NvVFX_StateObjectHandle*)malloc(sizeof(NvVFX_StateObjectHandle) * batchSize);
  if (batchOfStates == nullptr) {
    err = NVCV_ERR_MEMORY;
    goto bail;
  }

  // One image's height within the vertically-batched destination buffer.
  dstHeight = app._dst.height / batchSize;
  BAIL_IF_ERR(err = NvCVImage_Alloc(&nvx2, app._dst.width, dstHeight, NVCV_A, NVCV_U8, NVCV_CHUNKY, NVCV_CPU, 0));
  CVWrapperForNvCVImage(&nvx2, &ocv2);  // ocv2 aliases nvx2's pixel buffer
  // Main loop: one iteration per batch; exits via goto bail when any stream
  // runs out of frames (err remains NVCV_SUCCESS in that case).
  for(int j=0;;j++)
  {
    // Gather one frame per stream into the batched source buffer.
    for (unsigned int i = 0; i < batchSize; i++) {
      int capIdx = i%numOfVideoStreams; // interlacing frames from different video stream, but can in any order
      srcCaptures[capIdx] >> ocv1;
      if (ocv1.empty()) goto bail;
      batchOfStates[i] = arrayOfStates[capIdx];  // pair frame i with its stream's state

      NVWrapperForCVMat(&ocv1, &nvx1);
      if (!(nvx1.width == srcWidth && nvx1.height == srcHeight)) {
        printf("Input video file \"%s\" %ux%u does not match %ux%u\n"
          "Batching requires all video frames to be of the same size\n", srcVideos[i], nvx1.width, nvx1.height, srcWidth, srcHeight);
        BAIL(err, NVCV_ERR_MISMATCH);
      }
      BAIL_IF_ERR(err = TransferToNthImage(i, &nvx1, &app._src, 1.f, app._stream, NULL));
      ocv1.release();
    }

    // Run batch
    BAIL_IF_ERR(err = NvVFX_SetU32(app._eff, NVVFX_BATCH_SIZE, (unsigned)batchSize)); // The batchSize can change every Run
    BAIL_IF_ERR(err = NvVFX_SetStateObjectHandleArray(app._eff, NVVFX_STATE, batchOfStates)); // The batch of states can change every Run
    BAIL_IF_ERR(err = NvVFX_Run(app._eff, 0));

    // Copy each result matte back to the CPU and append to its writer.
    // NOTE(review): assumes TransferFromNthImage synchronizes app._stream
    // before the CPU-side write — confirm against BatchUtilities.
    for (unsigned int i = 0; i < batchSize; ++i) {
      int writerIdx = i % numOfVideoStreams;
      BAIL_IF_ERR(err = TransferFromNthImage(i, &app._dst, &nvx2, 1.0f, app._stream, NULL));
      dstWriters[writerIdx] << ocv2;
    }
    // NvCVImage_Dealloc() is called in the destructors
  }
bail:
  // If DeallocateState fails, all memory allocated in the SDK returns to the heap when the effect handle is destroyed.
  for (unsigned int i = 0; i < arrayOfStates.size(); i++) {
    NvVFX_DeallocateState(app._eff, arrayOfStates[i]);
  }
  arrayOfStates.clear();

  if (batchOfStates) {
    free(batchOfStates);
    batchOfStates = nullptr;
  }

  for (auto& cap : srcCaptures) {
    if (cap.isOpened()) cap.release();
  }

  for (auto& writer : dstWriters) {
    if (writer.isOpened()) writer.release();
  }

  return err;
}
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
int nErrs;
|
||||
NvCV_Status vfxErr;
|
||||
|
||||
nErrs = ParseMyArgs(argc, argv);
|
||||
if (nErrs)
|
||||
return nErrs;
|
||||
|
||||
// If the outFile is missing a stream index
|
||||
// insert one, assuming a period followed by a three-character extension
|
||||
if (FLAG_outFile.empty())
|
||||
FLAG_outFile = "BatchOut_%02u.mp4";
|
||||
else if (std::string::npos == FLAG_outFile.find_first_of('%'))
|
||||
FLAG_outFile.insert(FLAG_outFile.size() - 4, "_%02u");
|
||||
|
||||
#ifdef NVVFX_FX_GREEN_SCREEN
|
||||
vfxErr = BatchProcess(NVVFX_FX_GREEN_SCREEN, FLAG_mode, FLAG_inFiles, FLAG_outFile.c_str(), FLAG_codec);
|
||||
#elif defined(NVVFX_FX_GREEN_SCREEN_I)
|
||||
vfxErr = BatchProcess(NVVFX_FX_GREEN_SCREEN_I, FLAG_mode, FLAG_inFiles, FLAG_outFile.c_str(), FLAG_codec);
|
||||
#endif
|
||||
if (NVCV_SUCCESS != vfxErr) {
|
||||
Usage();
|
||||
printf("Error: %s\n", NvCV_GetErrorStringFromCode(vfxErr));
|
||||
nErrs = (int)vfxErr;
|
||||
}
|
||||
|
||||
return nErrs;
|
||||
}
|
Binary file not shown.
@ -0,0 +1,345 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright (c) 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <string>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "BatchUtilities.h"
|
||||
#include "nvCVOpenCV.h"
|
||||
#include "nvVideoEffects.h"
|
||||
#include "opencv2/opencv.hpp"
|
||||
|
||||
#ifdef _MSC_VER
#define strcasecmp _stricmp  // MSVC spells the POSIX case-insensitive compare differently
#endif // _MSC_VER

// Error-handling helpers: on failure, record the status code and jump to the
// function-local "bail:" cleanup label.
#define BAIL_IF_ERR(err) do { if (0 != (err)) { goto bail; } } while(0)
#define BAIL_IF_NULL(x, err, code) do { if ((void*)(x) == NULL) { err = code; goto bail; } } while(0)
#define BAIL_IF_FALSE(x, err, code) do { if (!(x)) { err = code; goto bail; } } while(0)
#define BAIL(err, code) do { err = code; goto bail; } while(0)

// Command-line flags, populated by ParseMyArgs().
bool FLAG_verbose = false;   // extra console output
float FLAG_strength = 0.f,   // denoising strength [0-1]
  FLAG_scale = 1.0;          // scale factor (parsed but not used by this app)
int FLAG_mode = 0,           // parsed but not used by this app
  FLAG_resolution = 0,       // parsed but not used by this app
  FLAG_batchSize = 8;        // frames per inference batch
std::string FLAG_outFile,    // output file pattern (must contain one %u/%d)
  FLAG_modelDir;             // directory containing the model files
std::vector<const char*> FLAG_inFiles;  // positional args: input video paths

// Set this when using OTA Updates
// This path is used by nvVideoEffectsProxy.cpp to load the SDK dll
// when using OTA Updates
char *g_nvVFXSDKPath = NULL;
|
||||
|
||||
// Match "arg" against "--flag" or "--flag=value" (any number of leading
// dashes is accepted). On a bare match *val is set to NULL; with "=",
// *val points at the text after '='. Returns false when arg is not this flag.
static bool GetFlagArgVal(const char *flag, const char *arg, const char **val) {
  if (arg[0] != '-')
    return false;  // flags must start with a dash
  do {
    ++arg;         // strip every leading dash
  } while (*arg == '-');
  const char *eq = strchr(arg, '=');
  if (!eq) {       // bare flag: the whole remaining name must match
    if (strcmp(flag, arg))
      return false;
    *val = NULL;
    return true;
  }
  const size_t nameLen = (size_t)(eq - arg);
  if (strlen(flag) != nameLen || strncmp(flag, arg, nameLen))
    return false;  // the name before '=' must match exactly, full length
  *val = eq + 1;
  return true;
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, std::string *val) {
|
||||
const char *valStr;
|
||||
if (!GetFlagArgVal(flag, arg, &valStr))
|
||||
return false;
|
||||
val->assign(valStr ? valStr : "");
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, bool *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success) {
|
||||
*val = (valStr == NULL ||
|
||||
strcasecmp(valStr, "true") == 0 ||
|
||||
strcasecmp(valStr, "on") == 0 ||
|
||||
strcasecmp(valStr, "yes") == 0 ||
|
||||
strcasecmp(valStr, "1") == 0
|
||||
);
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, float *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtof(valStr, NULL);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, long *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtol(valStr, NULL, 10);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, int *val) {
|
||||
long longVal;
|
||||
bool success = GetFlagArgVal(flag, arg, &longVal);
|
||||
if (success)
|
||||
*val = (int)longVal;
|
||||
return success;
|
||||
}
|
||||
|
||||
// Print command-line help for BatchDenoiseEffectApp.
// Fix: ParseMyArgs() accepts "--batch_size" but the help text advertised
// "--batchsize"; the two now agree.
static void Usage() {
  printf(
    "BatchDenoiseEffectApp [flags ...] inFile1 [ inFileN ...]\n"
    " where flags is:\n"
    " --out_file=<path> output video files to be written (a pattern with one %%u or %%d), default \"BatchOut_%%02u.mp4\"\n"
    " --strength=<value> strength of denoising [0-1]\n"
    " --model_dir=<path> the path to the directory that contains the models\n"
    " --batch_size=<value> size of the batch (default: 8)\n"
    " --verbose verbose output\n"
    " and inFile1 ... are identically sized video files\n"
  );
}
|
||||
|
||||
// Parse command-line arguments into the FLAG_* globals.
// Returns nonzero when the caller should exit (currently only after --help).
static int ParseMyArgs(int argc, char **argv) {
  int errs = 0;
  for (--argc, ++argv; argc--; ++argv) {  // skip argv[0], walk the rest
    bool help;
    const char *arg = *argv;
    if (arg[0] == '-') {
      if (arg[1] == '-') { // double-dash
        // Each GetFlagArgVal overload consumes the arg iff the name matches.
        if (GetFlagArgVal("verbose", arg, &FLAG_verbose) ||
          GetFlagArgVal("strength", arg, &FLAG_strength) ||
          GetFlagArgVal("scale", arg, &FLAG_scale) ||
          GetFlagArgVal("mode", arg, &FLAG_mode) ||
          GetFlagArgVal("model_dir", arg, &FLAG_modelDir) ||
          GetFlagArgVal("out_file", arg, &FLAG_outFile) ||
          GetFlagArgVal("batch_size", arg, &FLAG_batchSize)
          ) {
          continue;
        } else if (GetFlagArgVal("help", arg, &help)) { // --help
          Usage();
          errs = 1;
        }
        // NOTE(review): an unrecognized "--" flag falls through silently here
        // (no warning printed, errs unchanged) — confirm this is intended.
      }
      else { // single dash: bundled short flags, e.g. "-v"
        for (++arg; *arg; ++arg) {
          if (*arg == 'v') {
            FLAG_verbose = true;
          } else {
            printf("Unknown flag ignored: \"-%c\"\n", *arg);
          }
        }
        continue;
      }
    }
    else { // no dash: positional argument, an input video file
      FLAG_inFiles.push_back(arg);
    }
  }
  return errs;
}
|
||||
|
||||
|
||||
// Owns one NvVFX effect instance plus the batched GPU source/destination
// buffers, a staging buffer, and the CUDA stream used to drive it.
class App {
public:
  NvVFX_Handle _eff;           // effect handle; nullptr until init() succeeds
  NvCVImage _src, _stg, _dst;  // batched GPU buffers; _stg is a transfer staging buffer
  CUstream _stream;            // CUDA stream for transfers and inference
  unsigned _batchSize;         // number of images stacked into _src/_dst

  App() : _eff(nullptr), _stream(0), _batchSize(0) {}
  // NOTE(review): assumes NvVFX_DestroyEffect tolerates a null handle — confirm.
  ~App() { NvVFX_DestroyEffect(_eff); if (_stream) NvVFX_CudaStreamDestroy(_stream); }

  // Create the effect, allocate batched planar-float BGR input/output buffers
  // sized from srcImg, set parameters, and load the model.
  // Returns the first failing SDK status, or NVCV_SUCCESS.
  NvCV_Status init(const char* effectName, unsigned batchSize, const NvCVImage *srcImg) {
    NvCV_Status err = NVCV_ERR_UNIMPLEMENTED;

    _batchSize = batchSize;
    BAIL_IF_ERR(err = NvVFX_CreateEffect(effectName, &_eff));

    // Batched buffers hold batchSize images; _dst.height / _batchSize below
    // implies the images are stacked vertically in one allocation.
    BAIL_IF_ERR(err = AllocateBatchBuffer(&_src, _batchSize, srcImg->width, srcImg->height, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_CUDA, 1));
    BAIL_IF_ERR(err = AllocateBatchBuffer(&_dst, _batchSize, srcImg->width, srcImg->height, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_CUDA, 1));
    BAIL_IF_ERR(err = NvVFX_SetString(_eff, NVVFX_MODEL_DIRECTORY, FLAG_modelDir.c_str()));

    { // Set parameters.
      NvCVImage nth;
      BAIL_IF_ERR(err = NvVFX_SetImage(_eff, NVVFX_INPUT_IMAGE, NthImage(0, srcImg->height, &_src, &nth))); // Set the first of the batched images in ...
      BAIL_IF_ERR(err = NvVFX_SetImage(_eff, NVVFX_OUTPUT_IMAGE, NthImage(0, _dst.height / _batchSize, &_dst, &nth))); // ... and out
      BAIL_IF_ERR(err = NvVFX_CudaStreamCreate(&_stream));
      BAIL_IF_ERR(err = NvVFX_SetCudaStream(_eff, NVVFX_CUDA_STREAM, _stream));

      BAIL_IF_ERR(err = NvVFX_Load(_eff));
    }

  bail:
    return err;
  }
};
|
||||
|
||||
|
||||
// Run the denoising effect over several equally-sized videos, batching frames
// round-robin across streams with one opaque CUDA state buffer per stream.
// Writes one H.264 output video per input, named via outfilePattern.
// NOTE(review): unlike the AIGS app, batchSize here comes from --batch_size
// and may exceed the number of streams; the size-mismatch printf below
// indexes srcVideos[i] with the batch index, which can be out of range in
// that case — confirm.
NvCV_Status BatchProcess(const char* effectName, const std::vector<const char*>& srcVideos, unsigned batchSize, const char *outfilePattern) {
  NvCV_Status err = NVCV_SUCCESS;
  App app;
  cv::Mat ocv1, ocv2;
  NvCVImage nvx1, nvx2;
  unsigned srcWidth, srcHeight, dstHeight;

  void** arrayOfStates = nullptr;  // one device state buffer per stream
  void** batchOfStates = nullptr;  // per-run batch view of the states
  unsigned int stateSizeInBytes;   // queried from the effect after init

  unsigned int numOfVideoStreams = static_cast<unsigned int>(srcVideos.size());
  std::vector<cv::VideoCapture> srcCaptures(numOfVideoStreams);
  std::vector<cv::VideoWriter> dstWriters(numOfVideoStreams);
  // Open every source video and a matching H.264 writer.
  for (unsigned int i = 0; i < numOfVideoStreams; i++) {
    srcCaptures[i].open(srcVideos[i]);
    if (srcCaptures[i].isOpened()==false) BAIL(err, NVCV_ERR_READ);

    int width, height;
    double fps;
    width = (int)srcCaptures[i].get(cv::CAP_PROP_FRAME_WIDTH);
    height = (int)srcCaptures[i].get(cv::CAP_PROP_FRAME_HEIGHT);
    fps = srcCaptures[i].get(cv::CAP_PROP_FPS);

    const int fourcc_h264 = cv::VideoWriter::fourcc('H','2','6','4');
    char fileName[1024];
    snprintf(fileName, sizeof(fileName), outfilePattern, i);  // pattern carries one %u/%d
    dstWriters[i].open(fileName, fourcc_h264, fps, cv::Size2i(width,height));
    if (dstWriters[i].isOpened() == false) BAIL(err, NVCV_ERR_WRITE);
  }

  // Read in the first image, to determine the resolution for init()
  BAIL_IF_FALSE(srcVideos.size() > 0, err, NVCV_ERR_MISSINGINPUT);
  srcCaptures[0] >> ocv1;
  srcCaptures[0].set(cv::CAP_PROP_POS_FRAMES, 0); //resetting to first frame
  if (!ocv1.data) {
    printf("Cannot read video file \"%s\"\n", srcVideos[0]);
    BAIL(err, NVCV_ERR_READ);
  }
  NVWrapperForCVMat(&ocv1, &nvx1);  // wrap the cv::Mat without copying pixels
  srcWidth = nvx1.width;
  srcHeight = nvx1.height;

  BAIL_IF_ERR(err = app.init(effectName, batchSize, &nvx1)); // Init effect and buffers

  // Creating state objects, one per stream.
  // NOTE(review): calloc/cudaMalloc return values are not checked here; an
  // allocation failure would surface later as a crash or SDK error — confirm.
  BAIL_IF_ERR(err = NvVFX_GetU32(app._eff, NVVFX_STATE_SIZE, &stateSizeInBytes));
  arrayOfStates = (void**)calloc(numOfVideoStreams, sizeof(void*)); // allocating void* array of numOfVideoStreams elements
  for (unsigned int i = 0; i < numOfVideoStreams; i++) {
    cudaMalloc(&arrayOfStates[i], stateSizeInBytes);
    cudaMemsetAsync(arrayOfStates[i], 0, stateSizeInBytes,app._stream);
  }
  //Creating batch array to hold states
  batchOfStates = (void**)calloc(batchSize, sizeof(void*));

  // One image's height within the vertically-batched destination buffer.
  dstHeight = app._dst.height / batchSize;
  BAIL_IF_ERR(err = NvCVImage_Alloc(&nvx2, app._dst.width, dstHeight, ((app._dst.numComponents == 1) ? NVCV_Y : NVCV_BGR), NVCV_U8, NVCV_CHUNKY, NVCV_CPU, 0));
  CVWrapperForNvCVImage(&nvx2, &ocv2);  // ocv2 aliases nvx2's pixel buffer
  // Main loop: one iteration per batch; exits via goto bail when any stream
  // runs out of frames (err remains NVCV_SUCCESS in that case).
  for(int j=0;;j++)
  {
    // Gather frames round-robin across streams into the batched source buffer,
    // scaling u8 [0,255] to f32 [0,1].
    for (unsigned int i = 0; i < batchSize; i++) {
      int capIdx = i%numOfVideoStreams; // interlacing frames from different video stream, but can in any order
      srcCaptures[capIdx] >> ocv1;
      if (ocv1.empty()) goto bail;
      batchOfStates[i] = arrayOfStates[capIdx];  // pair frame i with its stream's state

      NVWrapperForCVMat(&ocv1, &nvx1);
      if (!(nvx1.width == srcWidth && nvx1.height == srcHeight)) {
        printf("Input video file \"%s\" %ux%u does not match %ux%u\n"
          "Batching requires all video frames to be of the same size\n", srcVideos[i], nvx1.width, nvx1.height, srcWidth, srcHeight);
        BAIL(err, NVCV_ERR_MISMATCH);
      }
      BAIL_IF_ERR(err = TransferToNthImage(i, &nvx1, &app._src, 1.f / 255.f, app._stream, &app._stg));
      ocv1.release();
    }

    // Run batch
    BAIL_IF_ERR(err = NvVFX_SetU32(app._eff, NVVFX_BATCH_SIZE, (unsigned)batchSize)); // The batchSize can change every Run
    BAIL_IF_ERR(err = NvVFX_SetObject(app._eff, NVVFX_STATE, (void*)batchOfStates)); // The batch of states can change every Run
    BAIL_IF_ERR(err = NvVFX_Run(app._eff, 0));

    // Copy each result back to the CPU (f32 [0,1] scaled to u8 [0,255]) and
    // append it to its stream's writer.
    for (unsigned int i = 0; i < batchSize; ++i) {
      int writerIdx = i % numOfVideoStreams;
      BAIL_IF_ERR(err = TransferFromNthImage(i, &app._dst, &nvx2, 255.f, app._stream, &app._stg));
      dstWriters[writerIdx] << ocv2;
    }
    // NvCVImage_Dealloc() is called in the destructors
  }
bail:
  // Release device state buffers, the batch array, and all OpenCV handles.
  if (arrayOfStates) {
    for (unsigned int i = 0; i < numOfVideoStreams; i++) {
      if (arrayOfStates[i]) cudaFree(arrayOfStates[i]);
    }
    free(arrayOfStates);
  }
  if (batchOfStates) free(batchOfStates);

  for (auto& cap : srcCaptures) {
    if (cap.isOpened()) cap.release();
  }
  for (auto& writer : dstWriters) {
    if (writer.isOpened()) writer.release();
  }
  return err;
}
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
int nErrs;
|
||||
NvCV_Status vfxErr;
|
||||
|
||||
nErrs = ParseMyArgs(argc, argv);
|
||||
if (nErrs)
|
||||
return nErrs;
|
||||
|
||||
if (FLAG_outFile.empty())
|
||||
FLAG_outFile = "BatchOut_%02u.mp4";
|
||||
else if (std::string::npos == FLAG_outFile.find_first_of('%'))
|
||||
FLAG_outFile.insert(FLAG_outFile.size() - 4, "_%02u");
|
||||
|
||||
vfxErr = BatchProcess(NVVFX_FX_DENOISING, FLAG_inFiles, FLAG_batchSize, FLAG_outFile.c_str());
|
||||
if (NVCV_SUCCESS != vfxErr) {
|
||||
Usage();
|
||||
printf("Error: %s\n", NvCV_GetErrorStringFromCode(vfxErr));
|
||||
nErrs = (int)vfxErr;
|
||||
}
|
||||
|
||||
return nErrs;
|
||||
}
|
Binary file not shown.
@ -0,0 +1,400 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright (c) 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "BatchUtilities.h"
|
||||
#include "nvCVOpenCV.h"
|
||||
#include "nvVideoEffects.h"
|
||||
#include "opencv2/opencv.hpp"
|
||||
|
||||
#ifdef _MSC_VER
#define strcasecmp _stricmp  // MSVC spells the POSIX case-insensitive compare differently
#endif // _MSC_VER

// Error-handling helpers: on failure, record the status code and jump to the
// function-local "bail:" cleanup label.
#define BAIL_IF_ERR(err) do { if (0 != (err)) { goto bail; } } while(0)
#define BAIL_IF_NULL(x, err, code) do { if ((void*)(x) == NULL) { err = code; goto bail; } } while(0)
#define BAIL_IF_FALSE(x, err, code) do { if (!(x)) { err = code; goto bail; } } while(0)
#define BAIL(err, code) do { err = code; goto bail; } while(0)

// Command-line flags, populated by ParseMyArgs().
bool FLAG_verbose = false;  // extra console output
float FLAG_strength = 0.f,  // upscaling strength [0,1]
  FLAG_scale = 1.0;         // scale factor for the destination size
int FLAG_mode = 0,          // effect mode selector
  FLAG_resolution = 0;      // desired destination height (alternative to --scale)
std::string FLAG_outFile,   // output image file pattern
  FLAG_modelDir,            // directory containing the model files
  FLAG_effect;              // name of the effect to apply
std::vector<const char*> FLAG_inFiles;  // positional args: input image paths

// Set this when using OTA Updates
// This path is used by nvVideoEffectsProxy.cpp to load the SDK dll
// when using OTA Updates
char *g_nvVFXSDKPath = NULL;
|
||||
|
||||
// Match "arg" against "--flag" or "--flag=value" (any number of leading
// dashes is accepted). On a bare match *val is set to NULL; with "=",
// *val points at the text after '='. Returns false when arg is not this flag.
static bool GetFlagArgVal(const char *flag, const char *arg, const char **val) {
  if (arg[0] != '-')
    return false;  // flags must start with a dash
  do {
    ++arg;         // strip every leading dash
  } while (*arg == '-');
  const char *eq = strchr(arg, '=');
  if (!eq) {       // bare flag: the whole remaining name must match
    if (strcmp(flag, arg))
      return false;
    *val = NULL;
    return true;
  }
  const size_t nameLen = (size_t)(eq - arg);
  if (strlen(flag) != nameLen || strncmp(flag, arg, nameLen))
    return false;  // the name before '=' must match exactly, full length
  *val = eq + 1;
  return true;
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, std::string *val) {
|
||||
const char *valStr;
|
||||
if (!GetFlagArgVal(flag, arg, &valStr))
|
||||
return false;
|
||||
val->assign(valStr ? valStr : "");
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, bool *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success) {
|
||||
*val = (valStr == NULL ||
|
||||
strcasecmp(valStr, "true") == 0 ||
|
||||
strcasecmp(valStr, "on") == 0 ||
|
||||
strcasecmp(valStr, "yes") == 0 ||
|
||||
strcasecmp(valStr, "1") == 0
|
||||
);
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, float *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtof(valStr, NULL);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, long *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtol(valStr, NULL, 10);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, int *val) {
|
||||
long longVal;
|
||||
bool success = GetFlagArgVal(flag, arg, &longVal);
|
||||
if (success)
|
||||
*val = (int)longVal;
|
||||
return success;
|
||||
}
|
||||
|
||||
static void Usage() {
|
||||
printf(
|
||||
"BatchEffectApp [flags ...] inFile1 [ inFileN ...]\n"
|
||||
" where flags is:\n"
|
||||
" --out_file=<path> output image files to be written, default \"BatchOut_%%02u.png\"\n"
|
||||
" --effect=<effect> the effect to apply\n"
|
||||
" --strength=<value> strength of the upscaling effect, [0.0, 1.0]\n"
|
||||
" --scale=<scale> scale factor to be applied: 1.5, 2, 3, maybe 1.3333333\n"
|
||||
" --resolution=<height> the desired height (either --scale or --resolution may be used)\n"
|
||||
" --mode=<mode> mode 0 or 1\n"
|
||||
" --model_dir=<path> the path to the directory that contains the models\n"
|
||||
" --verbose verbose output\n"
|
||||
" and inFile1 ... are identically sized image files, e.g. png, jpg\n"
|
||||
);
|
||||
|
||||
const char* cStr;
|
||||
NvCV_Status err = NvVFX_GetString(nullptr, NVVFX_INFO, &cStr);
|
||||
if (NVCV_SUCCESS != err)
|
||||
printf("Cannot get effects: %s\n", NvCV_GetErrorStringFromCode(err));
|
||||
printf("where effects are:\n%s", cStr);
|
||||
}
|
||||
|
||||
static int ParseMyArgs(int argc, char **argv) {
|
||||
int errs = 0;
|
||||
for (--argc, ++argv; argc--; ++argv) {
|
||||
bool help;
|
||||
const char *arg = *argv;
|
||||
if (arg[0] == '-') {
|
||||
if (arg[1] == '-') { // double-dash
|
||||
if (GetFlagArgVal("verbose", arg, &FLAG_verbose) ||
|
||||
GetFlagArgVal("effect", arg, &FLAG_effect) ||
|
||||
GetFlagArgVal("strength", arg, &FLAG_strength) ||
|
||||
GetFlagArgVal("scale", arg, &FLAG_scale) ||
|
||||
GetFlagArgVal("mode", arg, &FLAG_mode) ||
|
||||
GetFlagArgVal("model_dir", arg, &FLAG_modelDir) ||
|
||||
GetFlagArgVal("out_file", arg, &FLAG_outFile)
|
||||
) {
|
||||
continue;
|
||||
} else if (GetFlagArgVal("help", arg, &help)) { // --help
|
||||
Usage();
|
||||
errs = 1;
|
||||
}
|
||||
}
|
||||
else { // single dash
|
||||
for (++arg; *arg; ++arg) {
|
||||
if (*arg == 'v') {
|
||||
FLAG_verbose = true;
|
||||
} else {
|
||||
printf("Unknown flag ignored: \"-%c\"\n", *arg);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else { // no dash
|
||||
FLAG_inFiles.push_back(arg);
|
||||
}
|
||||
}
|
||||
return errs;
|
||||
}
|
||||
|
||||
// Case-insensitive test of whether str ends with suf.
static bool HasSuffix(const char *str, const char *suf) {
  const size_t strLen = strlen(str);
  const size_t sufLen = strlen(suf);
  return strLen >= sufLen && 0 == strcasecmp(suf, str + (strLen - sufLen));
}
|
||||
|
||||
static bool HasOneOfTheseSuffixes(const char *str, ...) {
|
||||
bool matches = false;
|
||||
const char *suf;
|
||||
va_list ap;
|
||||
va_start(ap, str);
|
||||
while (nullptr != (suf = va_arg(ap, const char*))) {
|
||||
if (HasSuffix(str, suf)) {
|
||||
matches = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
va_end(ap);
|
||||
return matches;
|
||||
}
|
||||
|
||||
static bool IsLossyImageFile(const char *str) {
|
||||
return HasOneOfTheseSuffixes(str, ".jpg", ".jpeg", nullptr);
|
||||
}
|
||||
|
||||
class App {
|
||||
public:
|
||||
NvVFX_Handle _eff;
|
||||
NvCVImage _src, _dst, _stg;
|
||||
CUstream _stream;
|
||||
unsigned _batchSize;
|
||||
|
||||
App() : _eff(nullptr), _stream(0), _batchSize(0) {}
|
||||
~App() { NvVFX_DestroyEffect(_eff); if (_stream) NvVFX_CudaStreamDestroy(_stream); }
|
||||
|
||||
NvCV_Status init(const char* effectName, unsigned batchSize, const NvCVImage *src) {
|
||||
NvCV_Status err = NVCV_ERR_UNIMPLEMENTED;
|
||||
unsigned dw, dh;
|
||||
|
||||
if (FLAG_resolution) {
|
||||
dw = FLAG_resolution * src->width / src->height, // No rounding
|
||||
dh = FLAG_resolution;
|
||||
}
|
||||
else {
|
||||
dw = lroundf(src->width * FLAG_scale),
|
||||
dh = lroundf(src->height * FLAG_scale);
|
||||
}
|
||||
|
||||
_batchSize = batchSize;
|
||||
BAIL_IF_ERR(err = NvVFX_CreateEffect(effectName, &_eff));
|
||||
|
||||
if (!strcmp(effectName, NVVFX_FX_TRANSFER)) {
|
||||
BAIL_IF_ERR(err = AllocateBatchBuffer(&_src, _batchSize, src->width, src->height, NVCV_RGB, NVCV_U8, NVCV_CHUNKY, NVCV_CUDA, 0));
|
||||
BAIL_IF_ERR(err = AllocateBatchBuffer(&_dst, _batchSize, src->width, src->height, NVCV_RGB, NVCV_U8, NVCV_CHUNKY, NVCV_CUDA, 0));
|
||||
}
|
||||
#ifdef NVVFX_FX_SR_UPSCALE
|
||||
else if (!strcmp(effectName, NVVFX_FX_SR_UPSCALE)) {
|
||||
BAIL_IF_ERR(err = AllocateBatchBuffer(&_src, _batchSize, src->width, src->height, NVCV_RGBA, NVCV_U8, NVCV_CHUNKY, NVCV_CUDA, 32)); // n*32, n>=0
|
||||
BAIL_IF_ERR(err = AllocateBatchBuffer(&_dst, _batchSize, dw, dh, NVCV_RGBA, NVCV_U8, NVCV_CHUNKY, NVCV_CUDA, 32));
|
||||
BAIL_IF_ERR(err = NvVFX_SetF32(_eff, NVVFX_STRENGTH, FLAG_strength));
|
||||
}
|
||||
#endif // NVVFX_FX_SR_UPSCALE
|
||||
#ifdef NVVFX_FX_ARTIFACT_REDUCTION
|
||||
else if (!strcmp(effectName, NVVFX_FX_ARTIFACT_REDUCTION)) {
|
||||
BAIL_IF_ERR(err = AllocateBatchBuffer(&_src, _batchSize, src->width, src->height, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_CUDA, 1));
|
||||
BAIL_IF_ERR(err = AllocateBatchBuffer(&_dst, _batchSize, src->width, src->height, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_CUDA, 1));
|
||||
BAIL_IF_ERR(err = NvVFX_SetString(_eff, NVVFX_MODEL_DIRECTORY, FLAG_modelDir.c_str()));
|
||||
BAIL_IF_ERR(err = NvVFX_SetU32(_eff, NVVFX_MODE, FLAG_mode));
|
||||
}
|
||||
#endif // NVVFX_FX_ARTIFACT_REDUCTION
|
||||
#ifdef NVVFX_FX_SUPER_RES
|
||||
else if (!strcmp(effectName, NVVFX_FX_SUPER_RES)) {
|
||||
BAIL_IF_ERR(err = AllocateBatchBuffer(&_src, _batchSize, src->width, src->height, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_CUDA, 1));
|
||||
BAIL_IF_ERR(err = AllocateBatchBuffer(&_dst, _batchSize, dw, dh, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_CUDA, 1));
|
||||
BAIL_IF_ERR(err = NvVFX_SetString(_eff, NVVFX_MODEL_DIRECTORY, FLAG_modelDir.c_str()));
|
||||
BAIL_IF_ERR(err = NvVFX_SetU32(_eff, NVVFX_MODE, FLAG_mode));
|
||||
BAIL_IF_ERR(err = NvVFX_SetF32(_eff, NVVFX_STRENGTH, FLAG_strength));
|
||||
}
|
||||
#endif // NVVFX_FX_SUPER_RES
|
||||
else {
|
||||
BAIL(err, NVCV_ERR_UNIMPLEMENTED);
|
||||
}
|
||||
|
||||
{ // Set common parameters.
|
||||
NvCVImage nth;
|
||||
BAIL_IF_ERR(err = NvVFX_SetImage(_eff, NVVFX_INPUT_IMAGE, NthImage(0, src->height, &_src, &nth))); // Set the first of the batched images in ...
|
||||
BAIL_IF_ERR(err = NvVFX_SetImage(_eff, NVVFX_OUTPUT_IMAGE, NthImage(0, _dst.height / _batchSize, &_dst, &nth))); // ... and out
|
||||
BAIL_IF_ERR(err = NvVFX_CudaStreamCreate(&_stream));
|
||||
BAIL_IF_ERR(err = NvVFX_SetCudaStream(_eff, NVVFX_CUDA_STREAM, _stream));
|
||||
|
||||
// The batch size parameter is interpreted at two times:
|
||||
// (1) during Load(), an appropriate batch-size model is chosen and loaded;
|
||||
// (2) during Run(), the specified number of images in the batch are processed.
|
||||
// The optimum throughput results from submitting a batch which is an integral multiple of the batched model
|
||||
// chosen in Load().
|
||||
//
|
||||
// To request a particular batch-sized model, set the batch size before calling Load(),
|
||||
// then get the batch size afterward to find out what batch-size model was chosen. If you do not specify the
|
||||
// desired batchSize before calling Load(), it will choose the batchSize=1 model, since that is the default
|
||||
// value for batchSize.
|
||||
//
|
||||
// After calling Load(), you can subsequently change the batch size to any number, even larger or smaller
|
||||
// than the batch size of the chosen model. If a larger batch size is chosen, smaller batches are submitted
|
||||
// until the entire larger batch has been processed. In any event, the batch size should be set at least twice:
|
||||
// once before Load() and once before the initial Run(). In many server applications, it is expected that
|
||||
// the batch size is changing constantly as some videos complete and other are added, so setting the batchSize
|
||||
// before every Run() call would be typical.
|
||||
unsigned gotBatch;
|
||||
BAIL_IF_ERR(err = NvVFX_SetU32(_eff, NVVFX_MODEL_BATCH, _batchSize)); // Try to choose a model tuned to this batch size
|
||||
err = NvVFX_Load(_eff); // This will load a new batched model -- a weighty process
|
||||
if (!(NVCV_SUCCESS == err || NVCV_ERR_MODELSUBSTITUTION == err)) goto bail;
|
||||
BAIL_IF_ERR(err = NvVFX_GetU32(_eff, NVVFX_MODEL_BATCH, &gotBatch)); // This tells us the batch size of the chosen model
|
||||
if (FLAG_verbose && gotBatch != _batchSize) {
|
||||
printf("Effect %s has no batch=%u model; processing in multiple batches of size %u%s instead\n",
|
||||
effectName, _batchSize, gotBatch, (gotBatch > 1 ? " or less" : ""));
|
||||
BAIL_IF_ERR(err = NvVFX_SetU32(_eff, NVVFX_BATCH_SIZE, _batchSize)); // This is lightweight, and usually done each Run
|
||||
}
|
||||
}
|
||||
|
||||
bail:
|
||||
return err;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
NvCV_Status BatchProcessImages(const char* effectName, const std::vector<const char*>& srcImages, const char *outfilePattern) {
|
||||
NvCV_Status err = NVCV_SUCCESS;
|
||||
unsigned batchSize = (unsigned)srcImages.size();
|
||||
App app;
|
||||
cv::Mat ocv;
|
||||
NvCVImage nvx;
|
||||
unsigned srcWidth, srcHeight, dstHeight, i;
|
||||
|
||||
// Read in the first image, to determine the resolution for init()
|
||||
BAIL_IF_FALSE(srcImages.size() > 0, err, NVCV_ERR_MISSINGINPUT);
|
||||
ocv = cv::imread(srcImages[0]);
|
||||
if (!ocv.data) {
|
||||
printf("Cannot read image file \"%s\"\n", srcImages[0]);
|
||||
BAIL(err, NVCV_ERR_READ);
|
||||
}
|
||||
NVWrapperForCVMat(&ocv, &nvx);
|
||||
srcWidth = nvx.width;
|
||||
srcHeight = nvx.height;
|
||||
BAIL_IF_ERR(err = app.init(effectName, batchSize, &nvx)); // Init effect and buffers
|
||||
|
||||
// Transfer the first image to the batch src.
|
||||
// Note, in all transfers, the scale factor only applies to floating-point pixels.
|
||||
BAIL_IF_ERR(err = TransferToNthImage(0, &nvx, &app._src, 1.f/255.f, app._stream, &app._stg));
|
||||
ocv.release();
|
||||
|
||||
// Read the remaining images and transfer to the batch src
|
||||
for (i = 1; i < batchSize; ++i) {
|
||||
ocv = cv::imread(srcImages[i]);
|
||||
if (!ocv.data) {
|
||||
printf("Cannot read image file \"%s\"\n", srcImages[i]);
|
||||
BAIL(err, NVCV_ERR_READ);
|
||||
}
|
||||
NVWrapperForCVMat(&ocv, &nvx);
|
||||
if (!(nvx.width == srcWidth && nvx.height == srcHeight)) {
|
||||
printf("Input image file \"%s\" %ux%u does not match %ux%u\n", srcImages[i], nvx.width, nvx.height, srcWidth, srcHeight);
|
||||
BAIL(err, NVCV_ERR_MISMATCH);
|
||||
}
|
||||
BAIL_IF_ERR(err = TransferToNthImage(i, &nvx, &app._src, 1.f / 255.f, app._stream, &app._stg));
|
||||
ocv.release();
|
||||
}
|
||||
|
||||
// Run batch
|
||||
BAIL_IF_ERR(err = NvVFX_SetU32(app._eff, NVVFX_BATCH_SIZE, (unsigned)srcImages.size())); // The batchSize can change every Run
|
||||
BAIL_IF_ERR(err = NvVFX_Run(app._eff, 0));
|
||||
|
||||
// Retrieve and write images
|
||||
dstHeight = app._dst.height / batchSize;
|
||||
BAIL_IF_ERR(err = NvCVImage_Alloc(&nvx, app._dst.width, dstHeight, ((app._dst.numComponents == 1) ? NVCV_Y : NVCV_BGR), NVCV_U8, NVCV_CHUNKY, NVCV_CPU, 0));
|
||||
CVWrapperForNvCVImage(&nvx, &ocv);
|
||||
if(IsLossyImageFile(outfilePattern))
|
||||
fprintf(stderr, "WARNING: JPEG output file format will reduce image quality\n");
|
||||
for (i = 0; i < batchSize; ++i) {
|
||||
char fileName[1024];
|
||||
snprintf(fileName, sizeof(fileName), outfilePattern, i);
|
||||
BAIL_IF_ERR(err = TransferFromNthImage(i, &app._dst, &nvx, 255.f, app._stream, &app._stg));
|
||||
if (!cv::imwrite(fileName, ocv)) {
|
||||
printf("Cannot write image file \"%s\"\n", fileName);
|
||||
BAIL(err, NVCV_ERR_WRITE);
|
||||
}
|
||||
}
|
||||
// NvCVImage_Dealloc() is called in the destructors
|
||||
|
||||
bail:
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
int nErrs;
|
||||
NvCV_Status vfxErr;
|
||||
|
||||
nErrs = ParseMyArgs(argc, argv);
|
||||
if (nErrs)
|
||||
return nErrs;
|
||||
|
||||
if (FLAG_outFile.empty())
|
||||
FLAG_outFile = "BatchOut_%02u.png";
|
||||
else if (std::string::npos == FLAG_outFile.find_first_of('%'))
|
||||
FLAG_outFile.insert(FLAG_outFile.size() - 4, "_%02u"); // assuming .xxx, i.e. .jpg, .png
|
||||
|
||||
vfxErr = BatchProcessImages(FLAG_effect.c_str(), FLAG_inFiles, FLAG_outFile.c_str());
|
||||
if (NVCV_SUCCESS != vfxErr) {
|
||||
printf("Error: %s\n", NvCV_GetErrorStringFromCode(vfxErr));
|
||||
nErrs = (int)vfxErr;
|
||||
}
|
||||
|
||||
return nErrs;
|
||||
}
|
Binary file not shown.
@ -0,0 +1,156 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright (c) 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
|
||||
#include "BatchUtilities.h"
|
||||
|
||||
|
||||
/********************************************************************************
|
||||
* AllocateBatchBuffer
|
||||
********************************************************************************/
|
||||
|
||||
NvCV_Status AllocateBatchBuffer(NvCVImage *im, unsigned batchSize, unsigned width, unsigned height, NvCVImage_PixelFormat format,
|
||||
NvCVImage_ComponentType type, unsigned layout, unsigned memSpace, unsigned alignment) {
|
||||
return NvCVImage_Alloc(im, width, height * batchSize, format, type, layout, memSpace, alignment);
|
||||
}
|
||||
|
||||
|
||||
/********************************************************************************
|
||||
* NthImage
|
||||
********************************************************************************/
|
||||
|
||||
NvCVImage* NthImage(unsigned n, unsigned height, NvCVImage* full, NvCVImage* view) {
|
||||
unsigned y = height;
|
||||
if (NVCV_PLANAR & full->planar) { // if not any of the chunky formats
|
||||
if (NVCV_PLANAR == full->planar) y *= full->numComponents;
|
||||
else if (NVCV_YUV444 == full->pixelFormat) y *= 3;
|
||||
else if (NVCV_YUV422 == full->pixelFormat) y *= 2;
|
||||
else if (NVCV_YUV420 == full->pixelFormat) y = y * 3 / 2;
|
||||
else y = 0;
|
||||
}
|
||||
NvCVImage_InitView(view, full, 0, y * n, full->width, height);
|
||||
return view;
|
||||
}
|
||||
|
||||
|
||||
/********************************************************************************
|
||||
* ComputeImageBytes
|
||||
********************************************************************************/
|
||||
|
||||
int ComputeImageBytes(const NvCVImage* im) {
|
||||
int imageBytes = im->pitch * (int)im->height; // Correct for all chunky formats
|
||||
if (NVCV_PLANAR & im->planar) { // if not any of the chunky formats
|
||||
if (NVCV_PLANAR == im->planar) imageBytes *= (int)im->numComponents;
|
||||
else if (NVCV_YUV422 == im->pixelFormat) imageBytes *= 2;
|
||||
else if (NVCV_YUV420 == im->pixelFormat) imageBytes = imageBytes * 3 / 2;
|
||||
else imageBytes = 0;
|
||||
}
|
||||
return imageBytes;
|
||||
}
|
||||
|
||||
|
||||
/********************************************************************************
|
||||
* TransferToNthImage
|
||||
********************************************************************************/
|
||||
|
||||
NvCV_Status TransferToNthImage(
|
||||
unsigned n, const NvCVImage* src, NvCVImage* dstBatch, float scale, struct CUstream_st* stream, NvCVImage* tmp) {
|
||||
NvCVImage nth;
|
||||
return NvCVImage_Transfer(src, NthImage(n, src->height, dstBatch, &nth), scale, stream, tmp);
|
||||
}
|
||||
|
||||
|
||||
/********************************************************************************
|
||||
* TransferFromNthImage
|
||||
********************************************************************************/
|
||||
|
||||
NvCV_Status TransferFromNthImage(
|
||||
unsigned n, const NvCVImage* srcBatch, NvCVImage* dst, float scale, struct CUstream_st* stream, NvCVImage* tmp) {
|
||||
NvCVImage nth;
|
||||
return NvCVImage_Transfer(NthImage(n, dst->height, const_cast<NvCVImage*>(srcBatch), &nth), dst, scale, stream, tmp);
|
||||
}
|
||||
|
||||
|
||||
/********************************************************************************
|
||||
* TransferToBatchImage
|
||||
* This illustrates the use of the pixel offset method, but the Nth image method could be used instead.
|
||||
********************************************************************************/
|
||||
|
||||
NvCV_Status TransferToBatchImage(
|
||||
unsigned batchSize, const NvCVImage** srcArray, NvCVImage* dstBatch, float scale, struct CUstream_st* stream, NvCVImage* tmp) {
|
||||
NvCV_Status err = NVCV_SUCCESS;
|
||||
NvCVImage nth;
|
||||
(void)NthImage(0, (**srcArray).height, dstBatch, &nth);
|
||||
int nextDst = ComputeImageBytes(&nth);
|
||||
for (; batchSize--; ++srcArray, nth.pixels = (void*)((char*)nth.pixels + nextDst))
|
||||
if (NVCV_SUCCESS != (err = NvCVImage_Transfer(*srcArray, &nth, scale, stream, tmp)))
|
||||
break;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/********************************************************************************
|
||||
* TransferFromBatchImage
|
||||
* This illustrates the use of the pixel offset method, but the Nth image method could be used instead.
|
||||
********************************************************************************/
|
||||
|
||||
NvCV_Status TransferFromBatchImage(
|
||||
unsigned batchSize, const NvCVImage* srcBatch, NvCVImage** dstArray, float scale, struct CUstream_st* stream, NvCVImage* tmp) {
|
||||
NvCV_Status err = NVCV_SUCCESS;
|
||||
NvCVImage nth;
|
||||
(void)NthImage(0, (**dstArray).height, const_cast<NvCVImage*>(srcBatch), &nth);
|
||||
int nextSrc = ComputeImageBytes(&nth);
|
||||
for (; batchSize--; nth.pixels = (void*)((char*)nth.pixels + nextSrc), ++dstArray)
|
||||
if (NVCV_SUCCESS != (err = NvCVImage_Transfer(&nth, *dstArray, scale, stream, tmp)))
|
||||
break;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/********************************************************************************
|
||||
* TransferBatchImage
|
||||
********************************************************************************/
|
||||
|
||||
NvCV_Status TransferBatchImage(const NvCVImage *srcBatch, NvCVImage *dstBatch,
|
||||
unsigned imHeight, unsigned batchSize, float scale, struct CUstream_st *stream) {
|
||||
NvCV_Status err = NVCV_SUCCESS;
|
||||
NvCVImage tmp;
|
||||
|
||||
if ((!(srcBatch->planar & NVCV_PLANAR) && !(dstBatch->planar & NVCV_PLANAR)) // both chunky
|
||||
|| (srcBatch->planar == NVCV_PLANAR && dstBatch->planar == NVCV_PLANAR && srcBatch->pixelFormat == dstBatch->pixelFormat)
|
||||
) { // This is a fast transfer
|
||||
err = NvCVImage_Transfer(srcBatch, dstBatch, scale, stream, &tmp);
|
||||
}
|
||||
else { // This is guaranteed to be safe for all transfers
|
||||
NvCVImage subSrc, subDst;
|
||||
int nextSrc, nextDst, n;
|
||||
NvCVImage_InitView(&subSrc, const_cast<NvCVImage*>(srcBatch), 0, 0, srcBatch->width, imHeight);
|
||||
NvCVImage_InitView(&subDst, dstBatch, 0, 0, dstBatch->width, imHeight);
|
||||
nextSrc = ComputeImageBytes(&subSrc);
|
||||
nextDst = ComputeImageBytes(&subDst);
|
||||
for (n = batchSize; n--; subSrc.pixels = (char*)subSrc.pixels + nextSrc,
|
||||
subDst.pixels = (char*)subDst.pixels + nextDst)
|
||||
if (NVCV_SUCCESS != (err = NvCVImage_Transfer(&subSrc, &subDst, scale, stream, &tmp)))
|
||||
break;
|
||||
}
|
||||
return err;
|
||||
}
|
@ -0,0 +1,127 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright (c) 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
|
||||
#ifndef __BATCH_UTILITIES__
|
||||
#define __BATCH_UTILITIES__
|
||||
|
||||
#include "nvCVImage.h"
|
||||
|
||||
|
||||
//! Allocate a batch buffer.
|
||||
//! \note All of the arguments are identical to that of NvCVImage_Alloc plus the batchSize.
|
||||
//! \param[out] im the image to initialize.
|
||||
//! \param[in] batchSize the number i=of images in the batch.
|
||||
//! \param[in] width the desired width of each image, in pixels.
|
||||
//! \param[in] height the desired height of each image, in pixels.
|
||||
//! \param[in] format the format of the pixels.
|
||||
//! \param[in] type the type of the components of the pixels.
|
||||
//! \param[in] layout One of { NVCV_CHUNKY, NVCV_PLANAR } or one of the YUV layouts.
|
||||
//! \param[in] memSpace Location of the buffer: one of { NVCV_CPU, NVCV_CPU_PINNED, NVCV_GPU, NVCV_CUDA }
|
||||
//! \param[in] alignment row byte alignment. Choose 0 or a power of 2.
|
||||
//! 1: yields no gap whatsoever between scanlines;
|
||||
//! 0: default alignment: 4 on CPU, and cudaMallocPitch's choice on GPU.
|
||||
//! Other common values are 16 or 32 for cache line size, 32 for texture alignment.
|
||||
//! \return NVCV_SUCCESS if the operation was successful.
|
||||
//! \return NVCV_ERR_PIXELFORMAT if the pixel format is not accommodated.
|
||||
//! \return NVCV_ERR_MEMORY if there is not enough memory to allocate the buffer.
|
||||
//! \note this simply multiplies height by batchSize and calls NvCVImage_Alloc().
|
||||
NvCV_Status AllocateBatchBuffer(NvCVImage* im, unsigned batchSize, unsigned width, unsigned height,
|
||||
NvCVImage_PixelFormat format, NvCVImage_ComponentType type, unsigned layout, unsigned memSpace, unsigned alignment);
|
||||
|
||||
//! Initialize an image descriptor for the Nth image in a batch.
|
||||
//! \param[in] n the index of the desired image in the batch.
|
||||
//! \param[in] height the height of the image
|
||||
//! \param[in] full the batch image, or the 0th image in the batch.
|
||||
//! \param[out] view the image descriptor to be initialized to a view of the nth image in the batch.
|
||||
//! \return a pointer to the nth image view, facilitating the use of NthImage() inline as an argument to a function.
|
||||
//! \note NvCVImage nth; NvVFX_SetImage(effect, NVVFX_INPUT_IMAGE, NthImage(0, height, batchIn, &nth));
|
||||
//! is typically used to set the input image for a batch operation; similarly for output.
|
||||
NvCVImage* NthImage(unsigned n, unsigned height, NvCVImage* full, NvCVImage* view);
|
||||
|
||||
//! Compute the byte offset between one image in a batch and the next.
|
||||
//! \param[in] im the image to be measured.
|
||||
//! \return the increment from one image to the next in a batch.
|
||||
//! \note this will be negative if the pitch is negative.
|
||||
int ComputeImageBytes(const NvCVImage* im);
|
||||
|
||||
//! Transfer To the Nth Image in a Batched Image.
|
||||
//! \param[in] n the index of the batch image to modify.
|
||||
//! \param[in] src the source image.
|
||||
//! \param[in] dstBatch the batch destination image.
|
||||
//! \param[in] scale the pixel scale factor.
|
||||
//! \param[in] stream the CUDA stream on which to perform the transfer.
|
||||
//! \param[in] tmp the stage buffer (can be NULL, but can affect performance if needed).
|
||||
//! \return NVCV_SUCCESS if the operation was successful.
|
||||
NvCV_Status TransferToNthImage(
|
||||
unsigned n, const NvCVImage* src, NvCVImage* dstBatch, float scale, struct CUstream_st* stream, NvCVImage* tmp);
|
||||
|
||||
//! Transfer From the Nth Image in a Batched Image.
|
||||
//! \param[in] n the index of the batch image to read.
|
||||
//! \param[in] srcBatch the batch source image.
|
||||
//! \param[in] dst the destination image.
|
||||
//! \param[in] scale the pixel scale factor.
|
||||
//! \param[in] stream the CUDA stream on which to perform the transfer.
|
||||
//! \param[in] tmp the stage buffer (can be NULL, but can affect performance if needed).
|
||||
//! \return NVCV_SUCCESS if the operation was successful.
|
||||
NvCV_Status TransferFromNthImage(
|
||||
unsigned n, const NvCVImage* srcBatch, NvCVImage* dst, float scale, struct CUstream_st* stream, NvCVImage* tmp);
|
||||
|
||||
//! Transfer from a list of source images to a batch image.
|
||||
//! We use an array of image pointers rather than an array of images
|
||||
//! in order to more easily accommodate dynamically-changing batches.
|
||||
//! \param[in] batchSize the number of source images to be transferred to the batch image.
|
||||
//! \param[in] srcArray array of pointers to the source images.
|
||||
//! \param[out] dstBatch the batch destination image.
|
||||
//! \param[in] scale the pixel scale factor.
|
||||
//! \param[in] stream the CUDA stream.
|
||||
//! \param[in] tmp the stage buffer (can be NULL, but can affect performance if needed).
|
||||
//! \return NVCV_SUCCESS if the operation was successful.
|
||||
NvCV_Status TransferToBatchImage(
|
||||
unsigned batchSize, const NvCVImage** srcArray, NvCVImage* dstBatch, float scale, struct CUstream_st* stream, NvCVImage* tmp);
|
||||
|
||||
//! Transfer from a batch image to a list of destination images.
|
||||
//! We use an array of image pointers rather than an array of images
|
||||
//! in order to more easily accommodate dynamically-changing batches.
|
||||
//! \param[in] batchSize the number of destination images to be transferred from the batch image.
|
||||
//! \param[in] srcBatch the batch source image.
|
||||
//! \param[out] dstArray array of pointers to the source images.
|
||||
//! \param[in] scale the pixel scale factor.
|
||||
//! \param[in] stream the CUDA stream.
|
||||
//! \param[in] tmp the stage buffer (can be NULL, but can affect performance if needed).
|
||||
//! \return NVCV_SUCCESS if the operation was successful.
|
||||
NvCV_Status TransferFromBatchImage(
|
||||
unsigned batchSize, const NvCVImage* srcBatch, NvCVImage** dstArray, float scale, struct CUstream_st* stream, NvCVImage* tmp);
|
||||
|
||||
//! Transfer all images in a batch to another compatible batch of images.
|
||||
//! \param[in] srcBatch the batch source image.
|
||||
//! \param[out] dstBatch the batch destination image.
|
||||
//! \param[in] imHeight the height of each image in the batch.
|
||||
//! \param[in] batchSize the number of images in the batch.
|
||||
//! \param[in] scale the pixel scale factor.
|
||||
//! \param[in] stream the CUDA stream.
|
||||
//! \return NVCV_SUCCESS if the operation was successful.
|
||||
NvCV_Status TransferBatchImage(const NvCVImage* srcBatch, NvCVImage* dstBatch,
|
||||
unsigned imHeight, unsigned batchSize, float scale, struct CUstream_st* stream);
|
||||
|
||||
|
||||
#endif // __BATCH_UTILITIES__
|
@ -0,0 +1,137 @@
|
||||
set(SOURCE_FILES
|
||||
BatchEffectApp.cpp
|
||||
BatchUtilities.cpp
|
||||
../../nvvfx/src/nvVideoEffectsProxy.cpp
|
||||
../../nvvfx/src/nvCVImageProxy.cpp)
|
||||
|
||||
|
||||
# Set Visual Studio source filters
|
||||
source_group("Source Files" FILES ${SOURCE_FILES})
|
||||
|
||||
add_executable(BatchEffectApp ${SOURCE_FILES})
|
||||
target_include_directories(BatchEffectApp PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../utils
|
||||
)
|
||||
target_include_directories(BatchEffectApp PUBLIC
|
||||
${SDK_INCLUDES_PATH}
|
||||
)
|
||||
|
||||
if(MSVC)
|
||||
|
||||
target_link_libraries(BatchEffectApp PUBLIC
|
||||
opencv346
|
||||
NVVideoEffects
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../external/cuda/lib/x64/cudart.lib
|
||||
)
|
||||
|
||||
set(OPENCV_PATH_STR ${CMAKE_CURRENT_SOURCE_DIR}/../external/opencv/bin)
|
||||
set(PATH_STR "PATH=%PATH%" ${OPENCV_PATH_STR})
|
||||
set(CMD_ARG_STR "--show --in_file=\"${CMAKE_CURRENT_SOURCE_DIR}/../input/input_003054.jpg\" ")
|
||||
set_target_properties(BatchEffectApp PROPERTIES
|
||||
FOLDER SampleApps
|
||||
VS_DEBUGGER_ENVIRONMENT "${PATH_STR}"
|
||||
VS_DEBUGGER_COMMAND_ARGUMENTS "${CMD_ARG_STR}"
|
||||
)
|
||||
else()
|
||||
|
||||
target_link_libraries(BatchEffectApp PUBLIC
|
||||
NVVideoEffects
|
||||
NVCVImage
|
||||
OpenCV
|
||||
TensorRT
|
||||
CUDA
|
||||
)
|
||||
endif()
|
||||
|
||||
#Batch denoise effect
|
||||
set(SOURCE_FILES
|
||||
BatchDenoiseEffectApp.cpp
|
||||
BatchUtilities.cpp
|
||||
../../nvvfx/src/nvVideoEffectsProxy.cpp
|
||||
../../nvvfx/src/nvCVImageProxy.cpp)
|
||||
|
||||
# Set Visual Studio source filters
|
||||
source_group("Source Files" FILES ${SOURCE_FILES})
|
||||
|
||||
add_executable(BatchDenoiseEffectApp ${SOURCE_FILES})
|
||||
target_include_directories(BatchDenoiseEffectApp PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../utils
|
||||
)
|
||||
target_include_directories(BatchDenoiseEffectApp PUBLIC
|
||||
${SDK_INCLUDES_PATH}
|
||||
)
|
||||
|
||||
if(MSVC)
|
||||
target_link_libraries(BatchDenoiseEffectApp PUBLIC
|
||||
opencv346
|
||||
NVVideoEffects
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../external/cuda/lib/x64/cudart.lib
|
||||
)
|
||||
target_include_directories(BatchDenoiseEffectApp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../external/cuda/include)
|
||||
|
||||
set(OPENCV_PATH_STR ${CMAKE_CURRENT_SOURCE_DIR}/../external/opencv/bin)
|
||||
set(PATH_STR "PATH=%PATH%" ${OPENCV_PATH_STR})
|
||||
set(CMD_ARG_STR "video1.mp4 video2.mp4 ")
|
||||
set_target_properties(BatchDenoiseEffectApp PROPERTIES
|
||||
FOLDER SampleApps
|
||||
VS_DEBUGGER_ENVIRONMENT "${PATH_STR}"
|
||||
VS_DEBUGGER_COMMAND_ARGUMENTS "${CMD_ARG_STR}"
|
||||
)
|
||||
else()
|
||||
|
||||
target_link_libraries(BatchDenoiseEffectApp PUBLIC
|
||||
NVVideoEffects
|
||||
NVCVImage
|
||||
OpenCV
|
||||
TensorRT
|
||||
CUDA
|
||||
)
|
||||
endif()
|
||||
|
||||
#Batch aigs effect
|
||||
set(SOURCE_FILES
|
||||
BatchAigsEffectApp.cpp
|
||||
BatchUtilities.cpp
|
||||
../../nvvfx/src/nvVideoEffectsProxy.cpp
|
||||
../../nvvfx/src/nvCVImageProxy.cpp)
|
||||
|
||||
# Set Visual Studio source filters
|
||||
source_group("Source Files" FILES ${SOURCE_FILES})
|
||||
|
||||
add_executable(BatchAigsEffectApp ${SOURCE_FILES})
|
||||
target_include_directories(BatchAigsEffectApp PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../utils
|
||||
)
|
||||
target_include_directories(BatchAigsEffectApp PUBLIC
|
||||
${SDK_INCLUDES_PATH}
|
||||
)
|
||||
|
||||
if(MSVC)
|
||||
target_link_libraries(BatchAigsEffectApp PUBLIC
|
||||
opencv346
|
||||
NVVideoEffects
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../external/cuda/lib/x64/cudart.lib
|
||||
)
|
||||
target_include_directories(BatchAigsEffectApp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../external/cuda/include)
|
||||
|
||||
set(OPENCV_PATH_STR ${CMAKE_CURRENT_SOURCE_DIR}/../external/opencv/bin)
|
||||
set(PATH_STR "PATH=%PATH%" ${OPENCV_PATH_STR})
|
||||
set(CMD_ARG_STR "video1.mp4 video2.mp4 ")
|
||||
set_target_properties(BatchAigsEffectApp PROPERTIES
|
||||
FOLDER SampleApps
|
||||
VS_DEBUGGER_ENVIRONMENT "${PATH_STR}"
|
||||
VS_DEBUGGER_COMMAND_ARGUMENTS "${CMD_ARG_STR}"
|
||||
)
|
||||
else()
|
||||
|
||||
target_link_libraries(BatchAigsEffectApp PUBLIC
|
||||
NVVideoEffects
|
||||
NVCVImage
|
||||
OpenCV
|
||||
TensorRT
|
||||
CUDA
|
||||
)
|
||||
endif()
|
@ -0,0 +1,9 @@
|
||||
SETLOCAL
|
||||
SET PATH=%PATH%;..\external\opencv\bin;
|
||||
SET IMAGE_LIST=..\input\LeFret_000900.jpg ..\input\LeFret_001400.jpg ..\input\LeFret_003400.jpg ..\input\LeFret_012300.jpg
|
||||
BatchEffectApp.exe --effect=ArtifactReduction --out_file=ArtifactReduction_%%04u.png %IMAGE_LIST%
|
||||
BatchEffectApp.exe --effect=SuperRes --out_file=SuperRes_%%04u.png --scale=1.5 %IMAGE_LIST%
|
||||
BatchEffectApp.exe --effect=Upscale --out_file=Upscale_%%04u.png --scale=1.5 %IMAGE_LIST%
|
||||
|
||||
SET VIDEO_LIST=..\input\input_0_100_frames.mp4 ..\input\input_100_200_frames.mp4
|
||||
BatchAigsEffectApp.exe --out_file=GreenScreen_%04u.mp4 %VIDEO_LIST%
|
@ -0,0 +1,7 @@
|
||||
# Sample apps
|
||||
add_subdirectory(external)
|
||||
add_subdirectory(UpscalePipelineApp) # Artifact Reduction and Upscale
|
||||
add_subdirectory(VideoEffectsApp) # Artifact Reduction and Super Res
|
||||
add_subdirectory(AigsEffectApp) # Green Screen
|
||||
add_subdirectory(BatchEffectApp)
|
||||
add_subdirectory(DenoiseEffectApp)
|
@ -0,0 +1,35 @@
|
||||
set(SOURCE_FILES DenoiseEffectApp.cpp ../../nvvfx/src/nvVideoEffectsProxy.cpp ../../nvvfx/src/nvCVImageProxy.cpp)
|
||||
|
||||
# Set Visual Studio source filters
|
||||
source_group("Source Files" FILES ${SOURCE_FILES})
|
||||
|
||||
add_executable(DenoiseEffectApp ${SOURCE_FILES})
|
||||
target_include_directories(DenoiseEffectApp PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../utils)
|
||||
target_include_directories(DenoiseEffectApp PUBLIC ${SDK_INCLUDES_PATH})
|
||||
|
||||
if(MSVC)
|
||||
target_link_libraries(DenoiseEffectApp PUBLIC
|
||||
opencv346
|
||||
NVVideoEffects
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../external/cuda/lib/x64/cudart.lib
|
||||
)
|
||||
target_include_directories(DenoiseEffectApp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../external/cuda/include)
|
||||
set(OPENCV_PATH_STR ${CMAKE_CURRENT_SOURCE_DIR}/../external/opencv/bin)
|
||||
set(VFXSDK_PATH_STR ${CMAKE_CURRENT_SOURCE_DIR}/../../bin) # Also the location for CUDA/NVTRT/libcrypto
|
||||
set(PATH_STR "PATH=%PATH%" ${VFXSDK_PATH_STR} ${OPENCV_PATH_STR})
|
||||
set(CMD_ARG_STR "--model_dir=\"${CMAKE_CURRENT_SOURCE_DIR}/../../bin/models\" --show --webcam")
|
||||
set_target_properties(DenoiseEffectApp PROPERTIES
|
||||
FOLDER SampleApps
|
||||
VS_DEBUGGER_ENVIRONMENT "${PATH_STR}"
|
||||
VS_DEBUGGER_COMMAND_ARGUMENTS "${CMD_ARG_STR}"
|
||||
)
|
||||
else()
|
||||
|
||||
target_link_libraries(DenoiseEffectApp PUBLIC
|
||||
NVVideoEffects
|
||||
NVCVImage
|
||||
OpenCV
|
||||
TensorRT
|
||||
CUDA
|
||||
)
|
||||
endif()
|
@ -0,0 +1,690 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright (c) 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "nvCVOpenCV.h"
|
||||
#include "nvVideoEffects.h"
|
||||
#include "opencv2/opencv.hpp"
|
||||
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define strcasecmp _stricmp
|
||||
#include <Windows.h>
|
||||
#else // !_MSC_VER
|
||||
#include <sys/stat.h>
|
||||
#endif // _MSC_VER
|
||||
|
||||
// Error-handling helpers: on failure, jump to the function-local `bail:` label
// so cleanup code runs exactly once.
#define BAIL_IF_ERR(err) do { if (0 != (err)) { goto bail; } } while(0)
#define BAIL_IF_NULL(x, err, code) do { if ((void*)(x) == NULL) { err = code; goto bail; } } while(0)
// App-local pseudo-status used to request the usage message (not an NvCV code).
#define NVCV_ERR_HELP 411

// Default FOURCC for the output video writer, chosen per platform.
#ifdef _WIN32
#define DEFAULT_CODEC "avc1"
#else // !_WIN32
#define DEFAULT_CODEC "H264"
#endif // _WIN32
|
||||
|
||||
|
||||
// Command-line flags, populated by ParseMyArgs().
bool FLAG_debug = false,      // print extra debugging information
     FLAG_verbose = false,    // verbose output
     FLAG_show = false,       // display results in a window
     FLAG_progress = false,   // print percent progress while processing a file
     FLAG_webcam = false;     // use a webcam as the input
float FLAG_strength = 0.f;    // effect strength [0-1]
std::string FLAG_codec = DEFAULT_CODEC,   // FOURCC for the output video writer
            FLAG_camRes = "1280x720",     // requested webcam resolution, "WxH" or just height
            FLAG_inFile,                  // input image/video path
            FLAG_outFile,                 // output image/video path
            FLAG_outDir,                  // NOTE(review): never parsed or read in this file — TODO confirm
            FLAG_modelDir;                // directory containing the effect models


// Set this when using OTA Updates
// This path is used by nvVideoEffectsProxy.cpp to load the SDK dll
// when using OTA Updates
char *g_nvVFXSDKPath = NULL;
|
||||
|
||||
// Match `arg` against `--flag` or `--flag=value`. On a match, *val receives a
// pointer to the text after '=' (or NULL for a bare flag) and true is returned.
static bool GetFlagArgVal(const char *flag, const char *arg, const char **val) {
  if (arg[0] != '-')
    return false;
  ++arg;                       // skip the first dash ...
  while (*arg == '-')
    ++arg;                     // ... and any further leading dashes
  const char *eq = strchr(arg, '=');
  if (eq == NULL) {            // bare flag: names must match exactly
    if (strcmp(flag, arg) != 0)
      return false;
    *val = NULL;
    return true;
  }
  size_t nameLen = (size_t)(eq - arg);
  if (strlen(flag) != nameLen || strncmp(flag, arg, nameLen) != 0)
    return false;
  *val = eq + 1;               // value text begins just past '='
  return true;
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, std::string *val) {
|
||||
const char *valStr;
|
||||
if (!GetFlagArgVal(flag, arg, &valStr))
|
||||
return false;
|
||||
val->assign(valStr ? valStr : "");
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, bool *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success) {
|
||||
*val = (valStr == NULL ||
|
||||
strcasecmp(valStr, "true") == 0 ||
|
||||
strcasecmp(valStr, "on") == 0 ||
|
||||
strcasecmp(valStr, "yes") == 0 ||
|
||||
strcasecmp(valStr, "1") == 0
|
||||
);
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, float *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtof(valStr, NULL);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, long *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtol(valStr, NULL, 10);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, int *val) {
|
||||
long longVal;
|
||||
bool success = GetFlagArgVal(flag, arg, &longVal);
|
||||
if (success)
|
||||
*val = (int)longVal;
|
||||
return success;
|
||||
}
|
||||
|
||||
// Print command-line help to stdout.
// NOTE(review): --cam_res is parsed by ParseMyArgs but not documented here.
static void Usage() {
  printf(
    "DenoiseEffectApp [args ...]\n"
    "  where args is:\n"
    "  --in_file=<path>           input file to be processed (can be an image but the best denoising performance is observed on videos)\n"
    "  --webcam                   use a webcam as the input\n"
    "  --out_file=<path>          output file to be written\n"
    "  --show                     display the results in a window (for webcam, it is always true)\n"
    "  --strength=<value>         strength of an effect [0-1]\n"
    "  --model_dir=<path>         the path to the directory that contains the models\n"
    "  --codec=<fourcc>           the fourcc code for the desired codec (default " DEFAULT_CODEC ")\n"
    "  --progress                 show progress\n"
    "  --verbose                  verbose output\n"
    "  --debug                    print extra debugging information\n"
  );
}
|
||||
|
||||
// Parse argv into the FLAG_* globals.
// Returns the number of errors found (currently always 0) or NVCV_ERR_HELP
// when --help is given. Long flags use "--name[=value]"; the only short flag
// is -v (verbose). Unrecognized flags are reported and ignored.
static int ParseMyArgs(int argc, char **argv) {
  int errs = 0;
  for (--argc, ++argv; argc--; ++argv) {  // skip the program name
    bool help;
    const char *arg = *argv;
    if (arg[0] != '-') {
      continue;  // non-flag arguments are silently ignored
    } else if ((arg[1] == '-') &&
      // Long flags: the first parser in this chain that matches consumes the arg.
      ( GetFlagArgVal("verbose", arg, &FLAG_verbose) ||
        GetFlagArgVal("in", arg, &FLAG_inFile) ||
        GetFlagArgVal("in_file", arg, &FLAG_inFile) ||
        GetFlagArgVal("out", arg, &FLAG_outFile) ||
        GetFlagArgVal("out_file", arg, &FLAG_outFile) ||
        GetFlagArgVal("show", arg, &FLAG_show) ||
        GetFlagArgVal("webcam", arg, &FLAG_webcam) ||
        GetFlagArgVal("cam_res", arg, &FLAG_camRes) ||
        GetFlagArgVal("strength", arg, &FLAG_strength) ||
        GetFlagArgVal("model_dir", arg, &FLAG_modelDir) ||
        GetFlagArgVal("codec", arg, &FLAG_codec) ||
        GetFlagArgVal("progress", arg, &FLAG_progress) ||
        GetFlagArgVal("debug", arg, &FLAG_debug)
      )) {
      continue;
    } else if (GetFlagArgVal("help", arg, &help)) {
      return NVCV_ERR_HELP;
    } else if (arg[1] != '-') {
      // Single-dash short flags may be grouped, e.g. "-vv".
      for (++arg; *arg; ++arg) {
        if (*arg == 'v') {
          FLAG_verbose = true;
        } else {
          printf("Unknown flag ignored: \"-%c\"\n", *arg);
        }
      }
      continue;
    } else {
      printf("Unknown flag ignored: \"%s\"\n", arg);
    }
  }
  return errs;
}
|
||||
|
||||
// Case-insensitive test of whether `str` ends with `suf`.
static bool HasSuffix(const char *str, const char *suf) {
  size_t lenStr = strlen(str);
  size_t lenSuf = strlen(suf);
  return lenSuf <= lenStr && 0 == strcasecmp(suf, str + (lenStr - lenSuf));
}
|
||||
|
||||
static bool HasOneOfTheseSuffixes(const char *str, ...) {
|
||||
bool matches = false;
|
||||
const char *suf;
|
||||
va_list ap;
|
||||
va_start(ap, str);
|
||||
while (nullptr != (suf = va_arg(ap, const char*))) {
|
||||
if (HasSuffix(str, suf)) {
|
||||
matches = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
va_end(ap);
|
||||
return matches;
|
||||
}
|
||||
|
||||
static bool IsImageFile(const char *str) {
|
||||
return HasOneOfTheseSuffixes(str, ".bmp", ".jpg", ".jpeg", ".png", nullptr);
|
||||
}
|
||||
|
||||
static bool IsLossyImageFile(const char *str) {
|
||||
return HasOneOfTheseSuffixes(str, ".jpg", ".jpeg", nullptr);
|
||||
}
|
||||
|
||||
// Format a duration in seconds as "HH:MM:SS.mmm".
// Returns a pointer to a static buffer — not reentrant or thread-safe.
static const char* DurationString(double sc) {
  static char buf[16];
  int hours = (int)(sc / 3600.);
  sc -= hours * 3600.;
  int minutes = (int)(sc / 60.);
  sc -= minutes * 60.;
  snprintf(buf, sizeof(buf), "%02d:%02d:%06.3f", hours, minutes, sc);
  return buf;
}
|
||||
|
||||
// Basic properties of an opened video stream, filled in by GetVideoInfo().
struct VideoInfo {
  int codec;             // FOURCC packed into an int (4 chars, byte order as read)
  int width;             // frame width in pixels
  int height;            // frame height in pixels
  double frameRate;      // frames per second as reported by the backend
  long long frameCount;  // total frames; presumably meaningless for live sources — TODO confirm
};
|
||||
|
||||
// Query stream properties from an opened VideoCapture into *info and, when
// --verbose is set, print a human-readable summary.
static void GetVideoInfo(cv::VideoCapture& reader, const char *fileName, VideoInfo *info) {
  info->codec = (int)reader.get(cv::CAP_PROP_FOURCC);
  info->width = (int)reader.get(cv::CAP_PROP_FRAME_WIDTH);
  info->height = (int)reader.get(cv::CAP_PROP_FRAME_HEIGHT);
  info->frameRate = (double)reader.get(cv::CAP_PROP_FPS);
  info->frameCount = (long long)reader.get(cv::CAP_PROP_FRAME_COUNT);
  if (FLAG_verbose)
    // NOTE(review): frameCount / frameRate divides by zero (prints inf/nan) if
    // the backend reports no FPS, e.g. some webcams — TODO confirm acceptable.
    printf(
      "       file \"%s\"\n"
      "      codec %.4s\n"
      "      width %4d\n"
      "     height %4d\n"
      " frame rate %.3f\n"
      "frame count %4lld\n"
      "   duration %s\n",
      fileName, (char*)&info->codec, info->width, info->height, info->frameRate, info->frameCount,
      DurationString(info->frameCount / info->frameRate)
    );
}
|
||||
|
||||
// Pack up to the first four characters of `str` into an int FOURCC code
// (missing characters are zero bytes).
// FIX: the original read an int through the inactive member of a union, which
// is undefined behavior in C++; memcpy produces the identical byte layout
// without the UB.
static int StringToFourcc(const std::string& str) {
  char c[4] = { 0, 0, 0, 0 };
  size_t n = (str.size() < 4) ? str.size() : 4;
  memcpy(c, str.data(), n);
  int fourcc = 0;
  memcpy(&fourcc, c, sizeof(fourcc));
  return fourcc;
}
|
||||
|
||||
// Application object: owns the NvVFX effect handle, the CPU/GPU image
// buffers, and the UI state for the interactive (webcam) mode.
struct FXApp {
  // Unified status type: positive values are application-level errors; zero
  // and negative values mirror NvCV_Status so SDK errors pass through
  // appErrFromVfxStatus() unchanged.
  enum Err {
    errQuit = +1, // Application errors
    errFlag = +2,
    errRead = +3,
    errWrite = +4,
    errNone = NVCV_SUCCESS, // Video Effects SDK errors
    errGeneral = NVCV_ERR_GENERAL,
    errUnimplemented = NVCV_ERR_UNIMPLEMENTED,
    errMemory = NVCV_ERR_MEMORY,
    errEffect = NVCV_ERR_EFFECT,
    errSelector = NVCV_ERR_SELECTOR,
    errBuffer = NVCV_ERR_BUFFER,
    errParameter = NVCV_ERR_PARAMETER,
    errMismatch = NVCV_ERR_MISMATCH,
    errPixelFormat = NVCV_ERR_PIXELFORMAT,
    errModel = NVCV_ERR_MODEL,
    errLibrary = NVCV_ERR_LIBRARY,
    errInitialization = NVCV_ERR_INITIALIZATION,
    errFileNotFound = NVCV_ERR_FILE,
    errFeatureNotFound = NVCV_ERR_FEATURENOTFOUND,
    errMissingInput = NVCV_ERR_MISSINGINPUT,
    errResolution = NVCV_ERR_RESOLUTION,
    errUnsupportedGPU = NVCV_ERR_UNSUPPORTEDGPU,
    errWrongGPU = NVCV_ERR_WRONGGPU,
    errUnsupportedDriver = NVCV_ERR_UNSUPPORTEDDRIVER,
    errCudaMemory = NVCV_ERR_CUDA_MEMORY, // CUDA errors
    errCudaValue = NVCV_ERR_CUDA_VALUE,
    errCudaPitch = NVCV_ERR_CUDA_PITCH,
    errCudaInit = NVCV_ERR_CUDA_INIT,
    errCudaLaunch = NVCV_ERR_CUDA_LAUNCH,
    errCudaKernel = NVCV_ERR_CUDA_KERNEL,
    errCudaDriver = NVCV_ERR_CUDA_DRIVER,
    errCudaUnsupported = NVCV_ERR_CUDA_UNSUPPORTED,
    errCudaIllegalAddress = NVCV_ERR_CUDA_ILLEGAL_ADDRESS,
    errCuda = NVCV_ERR_CUDA,
  };

  // Default-construct with no effect loaded; buffers are allocated lazily by
  // allocBuffers() on the first frame.
  FXApp() { _eff = nullptr; _effectName = nullptr; _inited = false; _showFPS = false; _progress = false;
            _show = false; _enableEffect = true, _drawVisualization = true, _framePeriod = 0.f; }
  // NvVFX_DestroyEffect is assumed safe on a null handle — TODO confirm against SDK docs.
  ~FXApp() { NvVFX_DestroyEffect(_eff); }

  void setShow(bool show) { _show = show; }
  Err createEffect(const char *effectSelector, const char *modelDir);
  void destroyEffect();
  NvCV_Status allocBuffers(unsigned width, unsigned height);
  NvCV_Status allocTempBuffers();
  Err processImage(const char *inFile, const char *outFile);
  Err processMovie(const char *inFile, const char *outFile);
  Err initCamera(cv::VideoCapture& cap);
  Err processKey(int key);
  void drawFrameRate(cv::Mat& img);
  void drawEffectStatus(cv::Mat& img);
  // NvCV codes are <= 0, so the cast maps them directly onto the Err enum.
  Err appErrFromVfxStatus(NvCV_Status status) { return (Err)status; }
  const char* errorStringFromCode(Err code);

  NvVFX_Handle _eff;        // effect handle; null until createEffect() succeeds
  cv::Mat _srcImg;          // source frame on the CPU
  cv::Mat _dstImg;          // result frame on the CPU
  NvCVImage _srcGpuBuf;     // source frame on the GPU (planar float BGR)
  NvCVImage _dstGpuBuf;     // result frame on the GPU (planar float BGR)
  NvCVImage _srcVFX;        // NvCVImage alias wrapping _srcImg
  NvCVImage _dstVFX;        // NvCVImage alias wrapping _dstImg
  NvCVImage _tmpVFX;  // We use the same temporary buffer for source and dst, since it auto-shapes as needed
  bool _show;               // display output in a window
  bool _inited;             // buffers have been allocated
  bool _showFPS;            // overlay frame rate (toggled with 'f')
  bool _progress;           // print percent progress
  bool _enableEffect;       // run the effect vs. pass frames through (toggled with 'e')
  bool _drawVisualization;  // overlay effect on/off status (webcam mode, 'd')
  const char* _effectName;  // retained pointer to the selector passed to createEffect
  float _framePeriod;       // IIR-smoothed seconds per frame (0 = filter reset)
  std::chrono::high_resolution_clock::time_point _lastTime;  // timestamp of the previous frame
};
|
||||
|
||||
// Translate an Err into a human-readable message. Non-positive codes are
// NvCV statuses and are delegated to the SDK's own lookup.
const char* FXApp::errorStringFromCode(Err code) {
  if ((int)code <= 0)
    return NvCV_GetErrorStringFromCode((NvCV_Status)code);
  struct LutEntry { Err code; const char *str; };
  static const LutEntry lut[] = {
    { errRead,  "There was a problem reading a file" },
    { errWrite, "There was a problem writing a file" },
    { errQuit,  "The user chose to quit the application" },
    { errFlag,  "There was a problem with the command-line arguments" },
  };
  for (size_t i = 0; i < sizeof(lut) / sizeof(lut[0]); ++i)
    if (lut[i].code == code)
      return lut[i].str;
  return "UNKNOWN ERROR";
}
|
||||
|
||||
// Smooth the per-frame interval with a 1-pole IIR filter and, when enabled,
// draw the resulting FPS in the lower-left corner of the frame.
void FXApp::drawFrameRate(cv::Mat &img) {
  const float kTimeConstant = 16.f;
  std::chrono::high_resolution_clock::time_point now = std::chrono::high_resolution_clock::now();
  float elapsed = std::chrono::duration_cast<std::chrono::duration<float>>(now - _lastTime).count();
  if (elapsed <= 0.f || elapsed >= 100.f) {
    // Implausible interval (e.g. first frame after a stall); restart the filter.
    _framePeriod = 0.f;
  } else {
    if (_framePeriod)
      _framePeriod += (elapsed - _framePeriod) * (1.f / kTimeConstant); // 1-pole IIR filter
    else
      _framePeriod = elapsed;  // seed the filter with the first valid sample
    if (_showFPS) {
      char text[32];
      snprintf(text, sizeof(text), "%.1f", 1. / _framePeriod);
      cv::putText(img, text, cv::Point(10, img.rows - 10), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(255, 255, 255), 1);
    }
  }
  _lastTime = now;
}
|
||||
|
||||
// Handle one interactive keypress:
//   q/Q/Esc quit, f/F toggle FPS overlay, p/P/% toggle progress printout,
//   e/E toggle the effect, d/D toggle the status overlay (webcam mode only).
FXApp::Err FXApp::processKey(int key) {
  static const int ESC_KEY = 27;
  switch (key) {
    case 'Q': case 'q': case ESC_KEY:
      return errQuit;
    case 'f': case 'F':
      _showFPS = !_showFPS;
      break;
    case 'p': case 'P': case '%':
      _progress = !_progress;
      break;  // BUG FIX: previously fell through and also toggled _enableEffect
    case 'e': case 'E':
      _enableEffect = !_enableEffect;
      break;
    case 'd': case 'D':
      if (FLAG_webcam)
        _drawVisualization = !_drawVisualization;
      break;
    default:
      break;
  }
  return errNone;
}
|
||||
|
||||
// Open camera 0 and apply the resolution requested via --cam_res.
// Accepts "WxH"; a single number is treated as a height with a 16:9 width.
// FIXES: (1) "subsituted" typo in the user-facing message; (2) the warning no
// longer fires with a bogus "0 x 0" when the resolution string fails to parse;
// (3) the double results of cap.get() are cast to int explicitly.
FXApp::Err FXApp::initCamera(cv::VideoCapture& cap) {
  const int camIndex = 0;
  cap.open(camIndex);
  if (!FLAG_camRes.empty()) {
    int camWidth, camHeight, n;
    n = sscanf(FLAG_camRes.c_str(), "%d%*[xX]%d", &camWidth, &camHeight);
    switch (n) {
      case 2:
        break; // We have read both width and height
      case 1:
        // Height only: derive a 16:9 width, rounded to nearest integer.
        camHeight = camWidth;
        camWidth = (int)(camHeight * (16. / 9.) + .5);
        break;
      default:
        camHeight = 0; // unparsable: keep the camera's default mode
        camWidth = 0;
        break;
    }

    if (camWidth) cap.set(cv::CAP_PROP_FRAME_WIDTH, camWidth);
    if (camHeight) cap.set(cv::CAP_PROP_FRAME_HEIGHT, camHeight);
    int actualCamWidth = (int)cap.get(cv::CAP_PROP_FRAME_WIDTH);
    int actualCamHeight = (int)cap.get(cv::CAP_PROP_FRAME_HEIGHT);
    if (camWidth && camHeight && (camWidth != actualCamWidth || camHeight != actualCamHeight)) {
      printf("The requested resolution of %d x %d is not available and has been substituted by %d x %d.\n", camWidth, camHeight, actualCamWidth, actualCamHeight);
    }
  }
  return errNone;
}
|
||||
|
||||
// Overlay "Effect: on/off" near the lower-left corner of the frame.
void FXApp::drawEffectStatus(cv::Mat& img) {
  char text[32];
  snprintf(text, sizeof(text), "Effect: %s", _enableEffect ? "on" : "off");
  cv::putText(img, text, cv::Point(10, img.rows - 40), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(255, 255, 255), 1);
}
|
||||
|
||||
// Create the named effect and, if modelDir is non-empty, point it at the
// model directory. On failure the returned status is non-zero and _eff may
// be unusable; callers must check the result before running the effect.
FXApp::Err FXApp::createEffect(const char *effectSelector, const char *modelDir) {
  NvCV_Status vfxErr;
  BAIL_IF_ERR(vfxErr = NvVFX_CreateEffect(effectSelector, &_eff));
  // Retained pointer, not a copy: the selector string must outlive this app.
  _effectName = effectSelector;
  if (modelDir[0] != '\0'){
    BAIL_IF_ERR(vfxErr = NvVFX_SetString(_eff, NVVFX_MODEL_DIRECTORY, modelDir));
  }
bail:
  return appErrFromVfxStatus(vfxErr);
}
|
||||
|
||||
// Tear down the current effect and clear the handle so the destructor's
// second NvVFX_DestroyEffect call sees null.
void FXApp::destroyEffect() {
  NvVFX_DestroyEffect(_eff);
  _eff = nullptr;
}
|
||||
|
||||
// Allocate one temp buffer to be used for input and output. Reshaping of the temp buffer in NvCVImage_Transfer() is done automatically,
// and is very low overhead. We expect the destination to be largest, so we allocate that first to minimize reallocs probablistically.
// Then we Realloc for the source to get the union of the two.
// This could alternately be done at runtime by feeding in an empty temp NvCVImage, but there are advantages to allocating all memory at load time.
// Requires _srcVFX and _dstVFX to already be set up (see allocBuffers).
NvCV_Status FXApp::allocTempBuffers() {
  NvCV_Status vfxErr;
  BAIL_IF_ERR(vfxErr = NvCVImage_Alloc( &_tmpVFX, _dstVFX.width, _dstVFX.height, _dstVFX.pixelFormat, _dstVFX.componentType, _dstVFX.planar, NVCV_GPU, 0));
  BAIL_IF_ERR(vfxErr = NvCVImage_Realloc(&_tmpVFX, _srcVFX.width, _srcVFX.height, _srcVFX.pixelFormat, _srcVFX.componentType, _srcVFX.planar, NVCV_GPU, 0));
bail:
  return vfxErr;
}
|
||||
|
||||
// Allocate all CPU and GPU buffers for frames of the given size and wrap the
// CPU mats in NvCVImage aliases. Idempotent: returns immediately once inited.
NvCV_Status FXApp::allocBuffers(unsigned width, unsigned height) {
  NvCV_Status vfxErr = NVCV_SUCCESS;

  if (_inited)
    return NVCV_SUCCESS;

  // _srcImg may already hold data (processImage assigns it via cv::imread
  // before calling us); only create it when empty (the movie path).
  if (!_srcImg.data) {
    _srcImg.create(height, width, CV_8UC3); // src CPU
    BAIL_IF_NULL(_srcImg.data, vfxErr, NVCV_ERR_MEMORY);
  }

  _dstImg.create(_srcImg.rows, _srcImg.cols, _srcImg.type()); //
  BAIL_IF_NULL(_dstImg.data, vfxErr, NVCV_ERR_MEMORY); //
  BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_srcGpuBuf, _srcImg.cols, _srcImg.rows, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_GPU, 1)); // src GPU
  BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_dstGpuBuf, _srcImg.cols, _srcImg.rows, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_GPU, 1)); //dst GPU

  NVWrapperForCVMat(&_srcImg, &_srcVFX);      // _srcVFX is an alias for _srcImg
  NVWrapperForCVMat(&_dstImg, &_dstVFX);      // _dstVFX is an alias for _dstImg

//#define ALLOC_TEMP_BUFFERS_AT_RUN_TIME    // Deferring temp buffer allocation is easier
#ifndef ALLOC_TEMP_BUFFERS_AT_RUN_TIME      // Allocating temp buffers at load time avoids run time hiccups
  BAIL_IF_ERR(vfxErr = allocTempBuffers());  // This uses _srcVFX and _dstVFX and allocates one buffer to be a temporary for src and dst
#endif // ALLOC_TEMP_BUFFERS_AT_RUN_TIME

  _inited = true;

bail:
  return vfxErr;
}
|
||||
|
||||
// Run the denoise effect once over a single image file: read inFile into
// _srcImg, transfer to the GPU, run the effect, transfer the result back to
// _dstImg, then optionally write outFile and/or display the result.
// Returns errRead/errWrite for file problems, otherwise the NvCV status.
// FIXES: cudaMalloc failure was silently ignored (now bails with
// NVCV_ERR_MEMORY), and the errWrite early-return leaked the CUDA state
// buffer (now freed on that path too).
FXApp::Err FXApp::processImage(const char *inFile, const char *outFile) {
  CUstream stream = 0;       // default CUDA stream
  NvCV_Status vfxErr;

  void* state = nullptr;     // per-stream temporal state required by the denoise effect
  void* stateArray[1];

  if (!_eff)
    return errEffect;
  _srcImg = cv::imread(inFile);
  if (!_srcImg.data)
    return errRead;

  BAIL_IF_ERR(vfxErr = allocBuffers(_srcImg.cols, _srcImg.rows));
  BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_srcVFX, &_srcGpuBuf, 1.f / 255.f, stream, &_tmpVFX)); // _srcVFX--> _tmpVFX --> _srcGpuBuf
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_eff, NVVFX_INPUT_IMAGE, &_srcGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_eff, NVVFX_OUTPUT_IMAGE, &_dstGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetF32(_eff, NVVFX_STRENGTH, FLAG_strength));

  unsigned int stateSizeInBytes;
  BAIL_IF_ERR(vfxErr = NvVFX_GetU32(_eff, NVVFX_STATE_SIZE, &stateSizeInBytes));
  cudaMalloc(&state, stateSizeInBytes);
  BAIL_IF_NULL(state, vfxErr, NVCV_ERR_MEMORY);  // BUG FIX: allocation failure was previously unchecked
  cudaMemsetAsync(state, 0, stateSizeInBytes, stream);
  stateArray[0] = state;
  BAIL_IF_ERR(vfxErr = NvVFX_SetObject(_eff, NVVFX_STATE, (void*)stateArray));

  BAIL_IF_ERR(vfxErr = NvVFX_Load(_eff));
  BAIL_IF_ERR(vfxErr = NvVFX_Run(_eff, 0));
  BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_dstGpuBuf, &_dstVFX, 255.f, stream, &_tmpVFX));

  if (outFile && outFile[0]) {
    if (IsLossyImageFile(outFile))
      fprintf(stderr, "WARNING: JPEG output file format will reduce image quality\n");
    if (!cv::imwrite(outFile, _dstImg)) {
      printf("Error writing: \"%s\"\n", outFile);
      cudaFree(state);  // BUG FIX: this early return previously leaked the state buffer
      return errWrite;
    }
  }
  if (_show) {
    cv::imshow("Output", _dstImg);
    cv::waitKey(3000);  // leave the window up for ~3 seconds
  }
bail:
  if (state) cudaFree(state); // release state memory
  return appErrFromVfxStatus(vfxErr);
}
|
||||
|
||||
// Run the denoise effect over a video file or the webcam, frame by frame.
// Opens the input (file or camera), optionally opens a writer for outFile,
// allocates buffers sized to the stream, then loops: transfer frame to GPU,
// run effect (or pass through when toggled off), transfer back, write/show.
// Returns errRead/errWrite for I/O problems, otherwise the NvCV status.
FXApp::Err FXApp::processMovie(const char *inFile, const char *outFile) {
  const int fourcc_h264 = cv::VideoWriter::fourcc('H','2','6','4');
  CUstream stream = 0;          // default CUDA stream
  FXApp::Err appErr = errNone;
  bool ok;
  cv::VideoCapture reader;
  cv::VideoWriter writer;
  NvCV_Status vfxErr;
  unsigned frameNum;
  VideoInfo info;

  void* state = nullptr;        // per-stream temporal state required by the denoise effect
  void* stateArray[1];

  if (inFile && !inFile[0]) inFile = nullptr;  // Set file paths to NULL if zero length

  if (!FLAG_webcam && inFile) {
    reader.open(inFile);
  } else {
    appErr = initCamera(reader);
    if (appErr != errNone)
      return appErr;
  }

  if (!reader.isOpened()) {
    if (!FLAG_webcam) printf("Error: Could not open video: \"%s\"\n", inFile);
    else              printf("Error: Webcam not found\n");
    return errRead;
  }

  GetVideoInfo(reader, (inFile ? inFile : "webcam"), &info);
  if (!(fourcc_h264 == info.codec || cv::VideoWriter::fourcc('a', 'v', 'c', '1') == info.codec)) // avc1 is alias for h264
    printf("Filters only target H264 videos, not %.4s\n", (char*)&info.codec);

  BAIL_IF_ERR(vfxErr = allocBuffers(info.width, info.height));

  if (outFile && !outFile[0]) outFile = nullptr;
  if (outFile) {
    ok = writer.open(outFile, StringToFourcc(FLAG_codec), info.frameRate, cv::Size(_dstVFX.width, _dstVFX.height));
    if (!ok) {
      printf("Cannot open \"%s\" for video writing\n", outFile);
      outFile = nullptr;
      // With no writer and no window there is nothing to do; bail out.
      if (!_show)
        return errWrite;
    }
  }

  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_eff, NVVFX_INPUT_IMAGE, &_srcGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_eff, NVVFX_OUTPUT_IMAGE, &_dstGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetF32(_eff, NVVFX_STRENGTH, FLAG_strength));

  unsigned int stateSizeInBytes;
  BAIL_IF_ERR(vfxErr = NvVFX_GetU32(_eff, NVVFX_STATE_SIZE, &stateSizeInBytes));
  // NOTE(review): cudaMalloc's result is unchecked; on failure `state` stays
  // null and the effect is handed a null state pointer — consider
  // BAIL_IF_NULL(state, vfxErr, NVCV_ERR_MEMORY) as done in processImage.
  cudaMalloc(&state, stateSizeInBytes);
  cudaMemsetAsync(state, 0, stateSizeInBytes, stream);
  stateArray[0] = state;
  BAIL_IF_ERR(vfxErr = NvVFX_SetObject(_eff, NVVFX_STATE, (void*)stateArray));

  BAIL_IF_ERR(vfxErr = NvVFX_Load(_eff));

  for (frameNum = 0; reader.read(_srcImg); frameNum++) {
    if (_enableEffect) {
      BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_srcVFX, &_srcGpuBuf, 1.f / 255.f, stream, &_tmpVFX));
      BAIL_IF_ERR(vfxErr = NvVFX_Run(_eff, 0));
      BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_dstGpuBuf, &_dstVFX, 255.f, stream, &_tmpVFX));
    } else {
      // Pass-through: copy the source frame to the destination unchanged.
      BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_srcVFX, &_dstVFX, 1.f, stream, &_tmpVFX));
      cudaMemsetAsync(state, 0, stateSizeInBytes, stream);// reset state by setting to 0
    }

    if (outFile)
      writer.write(_dstImg);

    if (_show) {
      if (_drawVisualization) drawEffectStatus(_dstImg);
      drawFrameRate(_dstImg);
      cv::imshow("Output", _dstImg);
      int key= cv::waitKey(1);
      if (key > 0) {
        appErr = processKey(key);
        if (errQuit == appErr)
          break;
      }
    }
    // NOTE(review): for live sources info.frameCount may be 0, which makes
    // this percentage inf/nan; main() disables --progress for webcam input.
    if (_progress)
      fprintf(stderr, "\b\b\b\b%3.0f%%", 100.f * frameNum / info.frameCount);
  }

  if (_progress) fprintf(stderr, "\n");
  reader.release();
  if (outFile)
    writer.release();
bail:
  if (state) cudaFree(state); // release state memory
  return appErrFromVfxStatus(vfxErr);
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
FXApp::Err fxErr = FXApp::errNone;
|
||||
int nErrs;
|
||||
FXApp app;
|
||||
|
||||
nErrs = ParseMyArgs(argc, argv);
|
||||
if (nErrs)
|
||||
std::cerr << nErrs << " command line syntax problems\n";
|
||||
|
||||
if (FLAG_webcam) {
|
||||
// If webcam is on, enable showing the results and turn off displaying the progress
|
||||
if (FLAG_progress) FLAG_progress = !FLAG_progress;
|
||||
if (!FLAG_show) FLAG_show = !FLAG_show;
|
||||
}
|
||||
if (FLAG_inFile.empty() && !FLAG_webcam) {
|
||||
std::cerr << "Please specify --in_file=XXX or --webcam=true\n";
|
||||
++nErrs;
|
||||
}
|
||||
if (FLAG_outFile.empty() && !FLAG_show) {
|
||||
std::cerr << "Please specify --out_file=XXX or --show\n";
|
||||
++nErrs;
|
||||
}
|
||||
app._progress = FLAG_progress;
|
||||
app.setShow(FLAG_show);
|
||||
|
||||
if (nErrs) {
|
||||
Usage();
|
||||
fxErr = FXApp::errFlag;
|
||||
}
|
||||
else {
|
||||
fxErr = app.createEffect(NVVFX_FX_DENOISING, FLAG_modelDir.c_str());
|
||||
if (FXApp::errNone != fxErr) {
|
||||
std::cerr << "Error creating effect\n";
|
||||
}
|
||||
else {
|
||||
if (IsImageFile(FLAG_inFile.c_str()))
|
||||
fxErr = app.processImage(FLAG_inFile.c_str(), FLAG_outFile.c_str());
|
||||
else
|
||||
fxErr = app.processMovie(FLAG_inFile.c_str(), FLAG_outFile.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (fxErr)
|
||||
std::cerr << "Error: " << app.errorStringFromCode(fxErr) << std::endl;
|
||||
return (int)fxErr;
|
||||
}
|
Binary file not shown.
@ -0,0 +1,4 @@
|
||||
REM Run the denoise demo twice from the webcam: once with strength 0 and once
REM with full strength, displaying the output window each time.
SETLOCAL
SET PATH=%PATH%;..\external\opencv\bin;
DenoiseEffectApp.exe --webcam --strength=0 --show
DenoiseEffectApp.exe --webcam --strength=1 --show
|
@ -0,0 +1,27 @@
|
||||
The contents of this folder except the folder 'external' are
|
||||
governed by the MIT license. For 3rd party OSS S/W attributions,
|
||||
please refer to external/ThirdPartyLicenses.txt file.
|
||||
|
||||
Copyright (C) 2019, NVIDIA Corporation, all rights reserved.
|
||||
|
||||
MIT License
|
||||
|
||||
Permission is hereby granted, free of charge, to any person
|
||||
obtaining a copy of this software and associated documentation
|
||||
files (the "Software"), to deal in the Software without
|
||||
restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
OTHER DEALINGS IN THE SOFTWARE.
|
@ -0,0 +1,40 @@
|
||||
# Build UpscalePipelineApp: a demo that pipelines Artifact Reduction followed
# by Super Resolution (see UpscalePipeline.cpp). The proxy sources load the
# SDK/NvCVImage DLLs at run time.
set(SOURCE_FILES UpscalePipeline.cpp ../../nvvfx/src/nvVideoEffectsProxy.cpp ../../nvvfx/src/nvCVImageProxy.cpp)

# Set Visual Studio source filters
source_group("Source Files" FILES ${SOURCE_FILES})

add_executable(UpscalePipelineApp ${SOURCE_FILES})
target_include_directories(UpscalePipelineApp PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_CURRENT_SOURCE_DIR}/../utils
)
target_include_directories(UpscalePipelineApp PUBLIC
  ${SDK_INCLUDES_PATH}
)

if(MSVC)
  # NOTE(review): `opencv346` is a target supplied elsewhere in this SDK tree,
  # and cudart.lib is linked by hardcoded path; imported targets (e.g.
  # CUDA::cudart) would be preferable if the minimum CMake version is raised.
  target_link_libraries(UpscalePipelineApp PUBLIC
    opencv346
    NVVideoEffects
    ${CMAKE_CURRENT_SOURCE_DIR}/../external/cuda/lib/x64/cudart.lib
  )

  # Visual Studio debugger convenience: put the SDK and OpenCV DLL folders on
  # PATH and pass default command-line arguments (sample image, 1440p output).
  set(OPENCV_PATH_STR ${CMAKE_CURRENT_SOURCE_DIR}/../external/opencv/bin)
  set(VFXSDK_PATH_STR ${CMAKE_CURRENT_SOURCE_DIR}/../../bin) # Also the location for CUDA/NVTRT/libcrypto
  set(PATH_STR "PATH=%PATH%" ${VFXSDK_PATH_STR} ${OPENCV_PATH_STR})
  set(CMD_ARG_STR "--model_dir=\"${CMAKE_CURRENT_SOURCE_DIR}/../../bin/models\" --show --resolution=1440 --in_file=\"${CMAKE_CURRENT_SOURCE_DIR}/../input/input1.jpg\"")
  set_target_properties(UpscalePipelineApp PROPERTIES
    FOLDER SampleApps
    VS_DEBUGGER_ENVIRONMENT "${PATH_STR}"
    VS_DEBUGGER_COMMAND_ARGUMENTS "${CMD_ARG_STR}"
  )
else()
  # Linux: link the find-module targets configured in the top-level CMakeLists.
  target_link_libraries(UpscalePipelineApp PUBLIC
    NVVideoEffects
    NVCVImage
    OpenCV
    TensorRT
    CUDA
  )
endif()
|
@ -0,0 +1,652 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright (c) 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
#include "nvCVOpenCV.h"
|
||||
#include "nvVideoEffects.h"
|
||||
#include "opencv2/opencv.hpp"
|
||||
|
||||
/*########################################################################################################################
|
||||
# This application demonstrates the pipelining of two NvVFX_API video effects through a common use case whereby an image
|
||||
# or image sequence is fed first through the Artifact Removal filter, and then through the Super Resolution filter,
|
||||
# to produce an upscaled, video compression artifact-reduced version of the image/image sequence.
|
||||
# This is likely to be useful when dealing with low-quality input video bitstreams,
|
||||
# such as during game or movie streaming in a congested network environment.
|
||||
# While only the specific use case of pipelining the Artifact Removal and Super Resolution
|
||||
# filters is supported here to avoid undue code complexity, the basic method and structure shown here can be applied
|
||||
# to pipeline an arbitrary sequence of NvVFX_API video effects.
|
||||
##########################################################################################################################*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define strcasecmp _stricmp
|
||||
#include <Windows.h>
|
||||
#else // !_MSC_VER
|
||||
#include <sys/stat.h>
|
||||
#endif // _MSC_VER
|
||||
|
||||
#define BAIL_IF_ERR(err) do { if (0 != (err)) { goto bail; } } while(0)
|
||||
#define BAIL_IF_NULL(x, err, code) do { if ((void*)(x) == NULL) { err = code; goto bail; } } while(0)
|
||||
#define NVCV_ERR_HELP 411
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DEFAULT_CODEC "avc1"
|
||||
#else // !_WIN32
|
||||
#define DEFAULT_CODEC "H264"
|
||||
#endif // _WIN32
|
||||
|
||||
|
||||
bool FLAG_debug = false,
|
||||
FLAG_verbose = false,
|
||||
FLAG_show = false,
|
||||
FLAG_progress = false;
|
||||
int FLAG_resolution = 0,
|
||||
FLAG_arMode = 0;
|
||||
float FLAG_upscaleStrength = 0.2f;
|
||||
std::string FLAG_codec = DEFAULT_CODEC,
|
||||
FLAG_inFile,
|
||||
FLAG_outFile,
|
||||
FLAG_outDir,
|
||||
FLAG_modelDir;
|
||||
|
||||
// Set this when using OTA Updates
|
||||
// This path is used by nvVideoEffectsProxy.cpp to load the SDK dll
|
||||
// when using OTA Updates
|
||||
char *g_nvVFXSDKPath = NULL;
|
||||
|
||||
// Match one command-line token against a flag name.
// Accepts "-flag", "--flag" (any number of leading dashes) and "--flag=value".
// On a match, *val receives the value string, or NULL when no '=' is present,
// and true is returned; otherwise the token is left for other handlers.
static bool GetFlagArgVal(const char *flag, const char *arg, const char **val) {
  if ('-' != *arg)
    return false;
  while ('-' == *++arg)
    continue;                                  // skip all leading dashes
  const char *eq = strchr(arg, '=');
  if (eq == NULL) {                            // bare flag, e.g. "--show"
    if (0 != strcmp(flag, arg))
      return false;
    *val = NULL;
    return true;
  }
  size_t nameLen = (size_t)(eq - arg);         // characters before the '='
  if (strlen(flag) != nameLen || 0 != strncmp(flag, arg, nameLen))
    return false;
  *val = eq + 1;                               // value begins after the '='
  return true;
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, std::string *val) {
|
||||
const char *valStr;
|
||||
if (!GetFlagArgVal(flag, arg, &valStr))
|
||||
return false;
|
||||
val->assign(valStr ? valStr : "");
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, bool *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success) {
|
||||
*val = (valStr == NULL ||
|
||||
strcasecmp(valStr, "true") == 0 ||
|
||||
strcasecmp(valStr, "on") == 0 ||
|
||||
strcasecmp(valStr, "yes") == 0 ||
|
||||
strcasecmp(valStr, "1") == 0
|
||||
);
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, float *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtof(valStr, NULL);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, long *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtol(valStr, NULL, 10);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, int *val) {
|
||||
long longVal;
|
||||
bool success = GetFlagArgVal(flag, arg, &longVal);
|
||||
if (success)
|
||||
*val = (int)longVal;
|
||||
return success;
|
||||
}
|
||||
|
||||
static void Usage() {
|
||||
printf(
|
||||
"UpscalePipelineApp [args ...]\n"
|
||||
" where args is:\n"
|
||||
" --in_file=<path> input file to be processed\n"
|
||||
" --out_file=<path> output file to be written\n"
|
||||
" --show display the results in a window\n"
|
||||
" --ar_mode=(0|1) mode of artifact reduction filter (0: conservative, 1: aggressive, default 0)\n"
|
||||
" --upscale_strength=(0 to 1) strength of upscale filter (float value between 0 to 1)\n"
|
||||
" --resolution=<height> the desired height of the output\n"
|
||||
" --out_height=<height> the desired height of the output\n"
|
||||
" --model_dir=<path> the path to the directory that contains the models\n"
|
||||
" --codec=<fourcc> the fourcc code for the desired codec (default " DEFAULT_CODEC ")\n"
|
||||
" --progress show progress\n"
|
||||
" --verbose verbose output\n"
|
||||
" --debug print extra debugging information\n"
|
||||
);
|
||||
}
|
||||
|
||||
static int ParseMyArgs(int argc, char **argv) {
|
||||
int errs = 0;
|
||||
for (--argc, ++argv; argc--; ++argv) {
|
||||
bool help;
|
||||
const char *arg = *argv;
|
||||
if (arg[0] != '-') {
|
||||
continue;
|
||||
} else if ((arg[1] == '-') &&
|
||||
( GetFlagArgVal("verbose", arg, &FLAG_verbose) ||
|
||||
GetFlagArgVal("in", arg, &FLAG_inFile) ||
|
||||
GetFlagArgVal("in_file", arg, &FLAG_inFile) ||
|
||||
GetFlagArgVal("out", arg, &FLAG_outFile) ||
|
||||
GetFlagArgVal("out_file", arg, &FLAG_outFile) ||
|
||||
GetFlagArgVal("show", arg, &FLAG_show) ||
|
||||
GetFlagArgVal("ar_mode", arg, &FLAG_arMode) ||
|
||||
GetFlagArgVal("upscale_strength", arg, &FLAG_upscaleStrength) ||
|
||||
GetFlagArgVal("resolution", arg, &FLAG_resolution) ||
|
||||
GetFlagArgVal("model_dir", arg, &FLAG_modelDir) ||
|
||||
GetFlagArgVal("codec", arg, &FLAG_codec) ||
|
||||
GetFlagArgVal("progress", arg, &FLAG_progress) ||
|
||||
GetFlagArgVal("debug", arg, &FLAG_debug)
|
||||
)) {
|
||||
continue;
|
||||
} else if (GetFlagArgVal("help", arg, &help)) {
|
||||
return NVCV_ERR_HELP;
|
||||
} else if (arg[1] != '-') {
|
||||
for (++arg; *arg; ++arg) {
|
||||
if (*arg == 'v') {
|
||||
FLAG_verbose = true;
|
||||
} else {
|
||||
printf("Unknown flag ignored: \"-%c\"\n", *arg);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
printf("Unknown flag ignored: \"%s\"\n", arg);
|
||||
}
|
||||
}
|
||||
return errs;
|
||||
}
|
||||
|
||||
// Case-insensitive test for whether 'str' ends with 'suf'.
static bool HasSuffix(const char *str, const char *suf) {
  size_t strSize = strlen(str);
  size_t sufSize = strlen(suf);
  return (strSize >= sufSize) &&
         (0 == strcasecmp(suf, str + strSize - sufSize));
}
|
||||
|
||||
static bool HasOneOfTheseSuffixes(const char *str, ...) {
|
||||
bool matches = false;
|
||||
const char *suf;
|
||||
va_list ap;
|
||||
va_start(ap, str);
|
||||
while (nullptr != (suf = va_arg(ap, const char*))) {
|
||||
if (HasSuffix(str, suf)) {
|
||||
matches = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
va_end(ap);
|
||||
return matches;
|
||||
}
|
||||
|
||||
static bool IsImageFile(const char *str) {
|
||||
return HasOneOfTheseSuffixes(str, ".bmp", ".jpg", ".jpeg", ".png", nullptr);
|
||||
}
|
||||
|
||||
static bool IsLossyImageFile(const char *str) {
|
||||
return HasOneOfTheseSuffixes(str, ".jpg", ".jpeg", nullptr);
|
||||
}
|
||||
|
||||
// Format a duration in seconds as "HH:MM:SS.mmm".
// Returns a pointer to a static buffer: not thread-safe, and each call
// overwrites the previous result.
static const char* DurationString(double sc) {
  static char buf[16];
  int hr = (int)(sc / 3600.);
  sc -= hr * 3600.;
  int mn = (int)(sc / 60.);
  sc -= mn * 60.;
  snprintf(buf, sizeof(buf), "%02d:%02d:%06.3f", hr, mn, sc);
  return buf;
}
|
||||
|
||||
// Basic properties of an opened video stream, as reported by OpenCV.
struct VideoInfo {
  int       codec;       // fourcc, packed into an int
  int       width;       // frame width in pixels
  int       height;      // frame height in pixels
  double    frameRate;   // frames per second (may be 0 for some containers)
  long long frameCount;  // total frames, as reported by the container
};
|
||||
|
||||
// Query stream properties from an opened VideoCapture into *info, and log
// them when --verbose is set.
// Bug fix: the verbose path computed frameCount / frameRate unconditionally;
// some containers report CAP_PROP_FPS as 0, which made that a division by
// zero (and casting the resulting inf in DurationString is undefined).
// A zero duration is reported in that case instead.
static void GetVideoInfo(cv::VideoCapture& reader, const char *fileName, VideoInfo *info) {
  info->codec      = (int)reader.get(cv::CAP_PROP_FOURCC);
  info->width      = (int)reader.get(cv::CAP_PROP_FRAME_WIDTH);
  info->height     = (int)reader.get(cv::CAP_PROP_FRAME_HEIGHT);
  info->frameRate  = (double)reader.get(cv::CAP_PROP_FPS);
  info->frameCount = (long long)reader.get(cv::CAP_PROP_FRAME_COUNT);
  if (FLAG_verbose) {
    double seconds = (info->frameRate > 0.) ? info->frameCount / info->frameRate : 0.;
    printf(
      "  file \"%s\"\n"
      "   codec %.4s\n"            // fourcc bytes printed as 4 chars
      "   width %4d\n"
      "  height %4d\n"
      " frame rate %.3f\n"
      "frame count %4lld\n"
      "  duration %s\n",
      fileName, (char*)&info->codec, info->width, info->height, info->frameRate, info->frameCount,
      DurationString(seconds)
    );
  }
}
|
||||
|
||||
// Pack up to the first four characters of 'str' into an int, lowest byte
// first, matching OpenCV's fourcc representation; missing characters are 0.
// (memcpy replaces the original union-based type punning; the bytes produced
// are identical.)
static int StringToFourcc(const std::string& str) {
  char c[4] = { 0, 0, 0, 0 };
  int n = (str.size() < 4) ? (int)str.size() : 4;
  for (int i = 0; i < n; ++i)
    c[i] = str[i];
  int fourcc = 0;
  memcpy(&fourcc, c, sizeof(fourcc));
  return fourcc;
}
|
||||
|
||||
// Pipeline driver: owns the two NvVFX effects (artifact reduction feeding
// upscale), the CPU/GPU image buffers, and the image/movie processing loops.
struct FXApp {
  // Error domain: positive values are application-level errors; zero and
  // negative values are NvCV_Status codes, so the two sets can share one enum.
  enum Err {
    errQuit                 = +1,                          // Application errors
    errFlag                 = +2,
    errRead                 = +3,
    errWrite                = +4,
    errNone                 = NVCV_SUCCESS,                // Video Effects SDK errors
    errGeneral              = NVCV_ERR_GENERAL,
    errUnimplemented        = NVCV_ERR_UNIMPLEMENTED,
    errMemory               = NVCV_ERR_MEMORY,
    errEffect               = NVCV_ERR_EFFECT,
    errSelector             = NVCV_ERR_SELECTOR,
    errBuffer               = NVCV_ERR_BUFFER,
    errParameter            = NVCV_ERR_PARAMETER,
    errMismatch             = NVCV_ERR_MISMATCH,
    errPixelFormat          = NVCV_ERR_PIXELFORMAT,
    errModel                = NVCV_ERR_MODEL,
    errLibrary              = NVCV_ERR_LIBRARY,
    errInitialization       = NVCV_ERR_INITIALIZATION,
    errFileNotFound         = NVCV_ERR_FILE,
    errFeatureNotFound      = NVCV_ERR_FEATURENOTFOUND,
    errMissingInput         = NVCV_ERR_MISSINGINPUT,
    errResolution           = NVCV_ERR_RESOLUTION,
    errUnsupportedGPU       = NVCV_ERR_UNSUPPORTEDGPU,
    errWrongGPU             = NVCV_ERR_WRONGGPU,
    errUnsupportedDriver    = NVCV_ERR_UNSUPPORTEDDRIVER,
    errCudaMemory           = NVCV_ERR_CUDA_MEMORY,        // CUDA errors
    errCudaValue            = NVCV_ERR_CUDA_VALUE,
    errCudaPitch            = NVCV_ERR_CUDA_PITCH,
    errCudaInit             = NVCV_ERR_CUDA_INIT,
    errCudaLaunch           = NVCV_ERR_CUDA_LAUNCH,
    errCudaKernel           = NVCV_ERR_CUDA_KERNEL,
    errCudaDriver           = NVCV_ERR_CUDA_DRIVER,
    errCudaUnsupported      = NVCV_ERR_CUDA_UNSUPPORTED,
    errCudaIllegalAddress   = NVCV_ERR_CUDA_ILLEGAL_ADDRESS,
    errCuda                 = NVCV_ERR_CUDA,
  };

  // All state starts empty; buffers are allocated lazily by allocBuffers().
  FXApp() { _arEff = nullptr; _upscaleEff = nullptr; _inited = false; _showFPS = false; _progress = false;
            _show = false; _framePeriod = 0.f; }
  ~FXApp() { destroyEffects(); }

  void setShow(bool show) { _show = show; }
  Err  createEffects(const char *modelDir, NvVFX_EffectSelector first, NvVFX_EffectSelector second);
  void destroyEffects();
  NvCV_Status allocBuffers(unsigned width, unsigned height);
  NvCV_Status allocTempBuffers();
  Err  processImage(const char *inFile, const char *outFile);
  Err  processMovie(const char *inFile, const char *outFile);
  Err  processKey(int key);
  void drawFrameRate(cv::Mat& img);
  Err  appErrFromVfxStatus(NvCV_Status status) { return (Err)status; }  // enum shares NvCV codes
  const char* errorStringFromCode(Err code);

  NvVFX_Handle _arEff;             // artifact-reduction effect handle
  NvVFX_Handle _upscaleEff;        // upscale effect handle
  cv::Mat      _srcImg;            // source frame, CPU
  cv::Mat      _dstImg;            // destination frame, CPU
  NvCVImage    _srcGpuBuf;         // source, GPU, BGR f32 planar
  NvCVImage    _interGpuBGRf32pl;  // intermediate between the two effects
  NvCVImage    _interGpuRGBAu8;    // intermediate, converted for the upscaler
  NvCVImage    _dstGpuBuf;         // destination, GPU
  NvCVImage    _srcVFX;            // CPU wrapper aliasing _srcImg
  NvCVImage    _dstVFX;            // CPU wrapper aliasing _dstImg
  NvCVImage    _tmpVFX;            // scratch for NvCVImage_Transfer staging
  bool         _show;              // display output frames in a window
  bool         _inited;            // buffers allocated?
  bool         _showFPS;           // overlay frame rate on shown frames
  bool         _progress;          // print percentage progress
  float        _framePeriod;       // IIR-filtered seconds per frame
  std::chrono::high_resolution_clock::time_point _lastTime;  // previous frame timestamp
};
|
||||
|
||||
|
||||
// Map an error code to a human-readable message. Non-positive codes are
// NvCV_Status values and are delegated to the SDK; positive codes are
// app-level errors looked up in a small table.
const char* FXApp::errorStringFromCode(Err code) {
  if ((int)code <= 0)
    return NvCV_GetErrorStringFromCode((NvCV_Status)code);
  struct LutEntry { Err code; const char *str; };
  static const LutEntry lut[] = {
    { errRead,  "There was a problem reading a file"                  },
    { errWrite, "There was a problem writing a file"                  },
    { errQuit,  "The user chose to quit the application"              },
    { errFlag,  "There was a problem with the command-line arguments" },
  };
  for (size_t i = 0; i < sizeof(lut) / sizeof(lut[0]); ++i)
    if (lut[i].code == code)
      return lut[i].str;
  return "UNKNOWN ERROR";
}
|
||||
|
||||
// Update the smoothed frame period from the wall-clock gap since the last
// call, and (when enabled with 'f') draw the FPS in the window's corner.
void FXApp::drawFrameRate(cv::Mat &img) {
  const float timeConstant = 16.f;   // frames of smoothing in the IIR filter
  std::chrono::high_resolution_clock::time_point now = std::chrono::high_resolution_clock::now();
  std::chrono::duration<float> dur = std::chrono::duration_cast<std::chrono::duration<float>>(now - _lastTime);
  float elapsed = dur.count();
  if (!(0.f < elapsed && elapsed < 100.f)) {
    _framePeriod = 0.f;              // ludicrous interval (first frame, pause): reset the filter
  } else {
    // One-pole IIR low-pass of the per-frame period.
    _framePeriod = _framePeriod ? _framePeriod + (elapsed - _framePeriod) * (1.f / timeConstant)
                                : elapsed;
    if (_showFPS) {
      char buf[32];
      snprintf(buf, sizeof(buf), "%.1f", 1. / _framePeriod);
      cv::putText(img, buf, cv::Point(10, img.rows - 10), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(255, 255, 255), 1);
    }
  }
  _lastTime = now;
}
|
||||
|
||||
|
||||
// Handle a keypress from the preview window.
// q/Q/ESC requests quitting; f/F toggles the FPS overlay; p/P/% toggles the
// console progress display; everything else is ignored.
FXApp::Err FXApp::processKey(int key) {
  static const int ESC_KEY = 27;
  switch (key) {
    case 'Q': case 'q': case ESC_KEY:
      return errQuit;
    case 'f': case 'F':
      _showFPS = !_showFPS;
      return errNone;
    case 'p': case 'P': case '%':
      _progress = !_progress;
      return errNone;
    default:
      return errNone;
  }
}
|
||||
|
||||
// Create the two pipelined effects. The model directory is set only on the
// first effect; the second (upscale) is configured later in allocBuffers().
FXApp::Err FXApp::createEffects(const char *modelDir, NvVFX_EffectSelector first, NvVFX_EffectSelector second) {
  NvCV_Status vfxErr;
  BAIL_IF_ERR(vfxErr = NvVFX_CreateEffect(first, &_arEff));
  BAIL_IF_ERR(vfxErr = NvVFX_SetString(_arEff, NVVFX_MODEL_DIRECTORY, modelDir));
  BAIL_IF_ERR(vfxErr = NvVFX_CreateEffect(second, &_upscaleEff));
bail:
  return appErrFromVfxStatus(vfxErr);
}
|
||||
|
||||
// Release both effect handles and null them so repeated calls are harmless.
void FXApp::destroyEffects() {
  NvVFX_DestroyEffect(_arEff);
  _arEff = nullptr;
  NvVFX_DestroyEffect(_upscaleEff);
  _upscaleEff = nullptr;
}
|
||||
|
||||
// Allocate one temp buffer to be used for input and output. Reshaping of the temp buffer in NvCVImage_Transfer() is done automatically,
|
||||
// and is very low overhead. We expect the destination to be largest, so we allocate that first to minimize reallocs probablistically.
|
||||
// Then we Realloc for the source to get the union of the two.
|
||||
// This could alternately be done at runtime by feeding in an empty temp NvCVImage, but there are advantages to allocating all memory at load time.
|
||||
NvCV_Status FXApp::allocTempBuffers() {
|
||||
NvCV_Status vfxErr;
|
||||
BAIL_IF_ERR(vfxErr = NvCVImage_Alloc( &_tmpVFX, _dstVFX.width, _dstVFX.height, _dstVFX.pixelFormat, _dstVFX.componentType, _dstVFX.planar, NVCV_GPU, 0));
|
||||
BAIL_IF_ERR(vfxErr = NvCVImage_Realloc(&_tmpVFX, _srcVFX.width, _srcVFX.height, _srcVFX.pixelFormat, _srcVFX.componentType, _srcVFX.planar, NVCV_GPU, 0));
|
||||
bail:
|
||||
return vfxErr;
|
||||
}
|
||||
|
||||
// One-time allocation of every CPU/GPU buffer in the pipeline, sized from the
// input dimensions and --resolution. Idempotent: a second call is a no-op.
NvCV_Status FXApp::allocBuffers(unsigned width, unsigned height) {
  NvCV_Status vfxErr = NVCV_SUCCESS;
  int outWidth;

  if (_inited)
    return NVCV_SUCCESS;

  // Source CPU frame (processImage may have filled _srcImg already via imread).
  if (!_srcImg.data) {
    _srcImg.create(height, width, CV_8UC3);                                      // src CPU
    BAIL_IF_NULL(_srcImg.data, vfxErr, NVCV_ERR_MEMORY);
  }

  if (!FLAG_resolution) {
    printf("--resolution has not been specified\n");
    return NVCV_ERR_PARAMETER;
  }
  // Preserve the source aspect ratio at the requested output height.
  outWidth = _srcImg.cols * FLAG_resolution / _srcImg.rows;
  _dstImg.create(FLAG_resolution, outWidth, _srcImg.type());                     // dst CPU
  BAIL_IF_NULL(_dstImg.data, vfxErr, NVCV_ERR_MEMORY);
  BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_srcGpuBuf, _srcImg.cols, _srcImg.rows, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_GPU, 1));         // src GPU
  BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_interGpuBGRf32pl, _srcImg.cols, _srcImg.rows, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_GPU, 1));  // intermediate GPU
  BAIL_IF_ERR(vfxErr = NvVFX_SetF32(_upscaleEff, NVVFX_STRENGTH, FLAG_upscaleStrength));
  // The upscaler consumes interleaved RGBA u8 with 32-byte row alignment.
  BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_interGpuRGBAu8, _srcImg.cols, _srcImg.rows, NVCV_RGBA, NVCV_U8,
                                       NVCV_INTERLEAVED, NVCV_GPU, 32));         // intermediate GPU

  BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_dstGpuBuf, _dstImg.cols, _dstImg.rows, NVCV_RGBA, NVCV_U8, NVCV_INTERLEAVED,
                                       NVCV_GPU, 32));                           // dst GPU
  NVWrapperForCVMat(&_srcImg, &_srcVFX);   // _srcVFX is an alias for _srcImg
  NVWrapperForCVMat(&_dstImg, &_dstVFX);   // _dstVFX is an alias for _dstImg

//#define ALLOC_TEMP_BUFFERS_AT_RUN_TIME    // Deferring temp buffer allocation is easier
#ifndef ALLOC_TEMP_BUFFERS_AT_RUN_TIME      // Allocating temp buffers at load time avoids run time hiccups
  BAIL_IF_ERR(vfxErr = allocTempBuffers()); // Uses _srcVFX/_dstVFX; one buffer serves as temp for both
#endif // ALLOC_TEMP_BUFFERS_AT_RUN_TIME

  _inited = true;

bail:
  return vfxErr;
}
|
||||
|
||||
// Run the two-effect pipeline over a single still image and write/show the
// result. Returns errRead/errWrite for file problems, or a mapped SDK status.
FXApp::Err FXApp::processImage(const char *inFile, const char *outFile) {
  CUstream    stream = 0;   // default CUDA stream
  NvCV_Status vfxErr;

  if (!_arEff || !_upscaleEff)
    return errEffect;       // createEffects() must have succeeded first
  _srcImg = cv::imread(inFile);
  if (!_srcImg.data)
    return errRead;

  BAIL_IF_ERR(vfxErr = allocBuffers(_srcImg.cols, _srcImg.rows));

  // Upload, normalizing u8 [0,255] to f32 [0,1]:  _srcVFX --> _tmpVFX --> _srcGpuBuf
  BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_srcVFX, &_srcGpuBuf, 1.f/255.f, stream, &_tmpVFX));
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_arEff, NVVFX_INPUT_IMAGE, &_srcGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_arEff, NVVFX_OUTPUT_IMAGE, &_interGpuBGRf32pl));
  BAIL_IF_ERR(vfxErr = NvVFX_SetCudaStream(_arEff, NVVFX_CUDA_STREAM, stream));
  BAIL_IF_ERR(vfxErr = NvVFX_SetU32(_arEff, NVVFX_MODE, FLAG_arMode));

  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_upscaleEff, NVVFX_INPUT_IMAGE, &_interGpuRGBAu8));
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_upscaleEff, NVVFX_OUTPUT_IMAGE, &_dstGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetCudaStream(_upscaleEff, NVVFX_CUDA_STREAM, stream));

  BAIL_IF_ERR(vfxErr = NvVFX_Load(_arEff));
  BAIL_IF_ERR(vfxErr = NvVFX_Load(_upscaleEff));
  BAIL_IF_ERR(vfxErr = NvVFX_Run(_arEff, 0));                                                    // _srcGpuBuf --> _interGpuBGRf32pl
  // Convert between the two effects' pixel formats (f32 planar BGR -> u8 RGBA).
  BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_interGpuBGRf32pl, &_interGpuRGBAu8, 255.f, stream, &_tmpVFX));
  BAIL_IF_ERR(vfxErr = NvVFX_Run(_upscaleEff, 0));                                               // _interGpuRGBAu8 --> _dstGpuBuf
  BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_dstGpuBuf, &_dstVFX, 1.f, stream, &_tmpVFX));        // download to _dstImg

  if (outFile && outFile[0]) {
    if (IsLossyImageFile(outFile))
      fprintf(stderr, "WARNING: JPEG output file format will reduce image quality\n");
    if (!cv::imwrite(outFile, _dstImg)) {
      printf("Error writing: \"%s\"\n", outFile);
      return errWrite;
    }
  }
  if (_show) {
    cv::imshow("Output", _dstImg);
    cv::waitKey(3000);   // keep the window up for 3 seconds
  }
bail:
  return appErrFromVfxStatus(vfxErr);
}
|
||||
|
||||
// Run the two-effect pipeline over every frame of a video.
// Bug fix: processKey()'s result (notably errQuit when the user presses q/ESC)
// was stored in appErr, used only to break the loop, and then discarded — the
// function always returned the SDK status instead. The application error is
// now propagated to the caller, matching the errQuit entry in
// errorStringFromCode()'s table.
FXApp::Err FXApp::processMovie(const char *inFile, const char *outFile) {
  const int       fourcc_h264 = cv::VideoWriter::fourcc('H','2','6','4');
  CUstream        stream      = 0;        // default CUDA stream
  FXApp::Err      appErr      = errNone;  // app-level error from the key handler
  bool            ok;
  cv::VideoWriter writer;
  NvCV_Status     vfxErr;
  unsigned        frameNum;
  VideoInfo       info;

  cv::VideoCapture reader(inFile);
  if (!reader.isOpened()) {
    printf("Error: Could not open video: \"%s\"\n", inFile);
    return errRead;
  }

  GetVideoInfo(reader, inFile, &info);
  if (!(fourcc_h264 == info.codec || cv::VideoWriter::fourcc('a','v','c','1') == info.codec)) // avc1 is alias for h264
    printf("Filters only target H264 videos, not %.4s\n", (char*)&info.codec);

  BAIL_IF_ERR(vfxErr = allocBuffers(info.width, info.height));

  if (outFile && !outFile[0]) outFile = nullptr;   // treat "" as no output file
  if (outFile) {
    ok = writer.open(outFile, StringToFourcc(FLAG_codec), info.frameRate, cv::Size(_dstVFX.width, _dstVFX.height));
    if (!ok) {
      printf("Cannot open \"%s\" for video writing\n", outFile);
      outFile = nullptr;
      if (!_show)           // nothing to write and nothing to show: give up
        return errWrite;
    }
  }

  // Configure the pipeline once, outside the frame loop.
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_arEff, NVVFX_INPUT_IMAGE, &_srcGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_arEff, NVVFX_OUTPUT_IMAGE, &_interGpuBGRf32pl));
  BAIL_IF_ERR(vfxErr = NvVFX_SetCudaStream(_arEff, NVVFX_CUDA_STREAM, stream));
  BAIL_IF_ERR(vfxErr = NvVFX_SetU32(_arEff, NVVFX_MODE, FLAG_arMode));
  BAIL_IF_ERR(vfxErr = NvVFX_Load(_arEff));

  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_upscaleEff, NVVFX_INPUT_IMAGE, &_interGpuRGBAu8));
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_upscaleEff, NVVFX_OUTPUT_IMAGE, &_dstGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetCudaStream(_upscaleEff, NVVFX_CUDA_STREAM, stream));
  BAIL_IF_ERR(vfxErr = NvVFX_Load(_upscaleEff));

  for (frameNum = 0; reader.read(_srcImg); ++frameNum) {
    if (_srcImg.empty()) {
      printf("Frame %u is empty\n", frameNum);
    }

    // _srcVFX --> _tmpVFX --> _srcGpuBuf --> _interGpu* --> _dstGpuBuf --> _tmpVFX --> _dstVFX
    BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_srcVFX, &_srcGpuBuf, 1.f/255.f, stream, &_tmpVFX));
    BAIL_IF_ERR(vfxErr = NvVFX_Run(_arEff, 0));
    // Convert between the two effects' pixel formats.
    BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_interGpuBGRf32pl, &_interGpuRGBAu8, 255.f, stream, &_tmpVFX));
    BAIL_IF_ERR(vfxErr = NvVFX_Run(_upscaleEff, 0));
    BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_dstGpuBuf, &_dstVFX, 1.f, stream, &_tmpVFX));

    if (outFile)
      writer.write(_dstImg);
    if (_show) {
      drawFrameRate(_dstImg);
      cv::imshow("Output", _dstImg);
      int key = cv::waitKey(1);
      if (key > 0) {
        appErr = processKey(key);
        if (errQuit == appErr)
          break;
      }
    }
    if (_progress)
      fprintf(stderr, "\b\b\b\b%3.0f%%", 100.f * frameNum / info.frameCount);
  }

  if (_progress) fprintf(stderr, "\n");
  reader.release();
  if (outFile)
    writer.release();
  if (errNone != appErr)         // propagate errQuit (etc.) from the key handler
    return appErr;
bail:
  return appErrFromVfxStatus(vfxErr);
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int nErrs = 0;
|
||||
FXApp::Err fxErr = FXApp::errNone;
|
||||
FXApp app;
|
||||
|
||||
nErrs = ParseMyArgs(argc, argv);
|
||||
if (nErrs)
|
||||
std::cerr << nErrs << " command line syntax problems\n";
|
||||
|
||||
if (FLAG_inFile.empty()) {
|
||||
std::cerr << "Please specify --in_file=XXX\n";
|
||||
++nErrs;
|
||||
}
|
||||
if (FLAG_outFile.empty() && !FLAG_show) {
|
||||
std::cerr << "Please specify --out_file=XXX or --show\n";
|
||||
++nErrs;
|
||||
}
|
||||
|
||||
app._progress = FLAG_progress;
|
||||
app.setShow(FLAG_show);
|
||||
|
||||
if (nErrs) {
|
||||
Usage();
|
||||
fxErr = FXApp::errFlag;
|
||||
}
|
||||
else {
|
||||
NvVFX_EffectSelector first = NVVFX_FX_ARTIFACT_REDUCTION;
|
||||
NvVFX_EffectSelector second = NVVFX_FX_SR_UPSCALE;
|
||||
|
||||
fxErr = app.createEffects(FLAG_modelDir.c_str(), first, second);
|
||||
if (FXApp::errNone != fxErr) {
|
||||
std::cerr << "Error creating effects \"" << first << " & " << second << "\"\n";
|
||||
}
|
||||
else {
|
||||
if (IsImageFile(FLAG_inFile.c_str()))
|
||||
fxErr = app.processImage(FLAG_inFile.c_str(), FLAG_outFile.c_str());
|
||||
else
|
||||
fxErr = app.processMovie(FLAG_inFile.c_str(), FLAG_outFile.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (fxErr)
|
||||
std::cerr << "Error: " << app.errorStringFromCode(fxErr) << std::endl;
|
||||
return (int)fxErr;
|
||||
}
|
Binary file not shown.
@ -0,0 +1,6 @@
|
||||
SETLOCAL
REM Keep the PATH change local to this script; make the OpenCV DLLs findable.
SET PATH=%PATH%;..\external\opencv\bin;
REM Use --show to show the output in a window or use --out_file=<filename> to write output to file
REM Run the pipeline twice on the same input: upscale strength 0, then 1.
UpscalePipelineApp.exe --in_file=..\input\input1.jpg --ar_mode=0 --upscale_strength=0 --resolution=1080 --show --out_file=ar_sr_0.png
UpscalePipelineApp.exe --in_file=..\input\input1.jpg --ar_mode=0 --upscale_strength=1 --resolution=1080 --show --out_file=ar_sr_1.png
|
||||
|
@ -0,0 +1,35 @@
|
||||
# Sample app sources: the app itself plus the SDK/NvCVImage proxy loaders.
set(SOURCE_FILES VideoEffectsApp.cpp ../../nvvfx/src/nvVideoEffectsProxy.cpp ../../nvvfx/src/nvCVImageProxy.cpp)

# Set Visual Studio source filters
source_group("Source Files" FILES ${SOURCE_FILES})

add_executable(VideoEffectsApp ${SOURCE_FILES})
# Nothing links against an executable, so its includes/libraries are PRIVATE
# usage requirements (the original declared them PUBLIC, which is meaningless
# for an executable and invites accidental propagation).
target_include_directories(VideoEffectsApp PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_CURRENT_SOURCE_DIR}/../utils
  ${SDK_INCLUDES_PATH}
)

if(MSVC)
  target_link_libraries(VideoEffectsApp PRIVATE
    opencv346
    NVVideoEffects
    ${CMAKE_CURRENT_SOURCE_DIR}/../external/cuda/lib/x64/cudart.lib
  )

  # Debugger environment/arguments so F5 in Visual Studio works out of the box.
  set(OPENCV_PATH_STR ${CMAKE_CURRENT_SOURCE_DIR}/../external/opencv/bin)
  set(VFXSDK_PATH_STR ${CMAKE_CURRENT_SOURCE_DIR}/../../bin) # Also the location for CUDA/NVTRT/libcrypto
  set(PATH_STR "PATH=%PATH%" ${VFXSDK_PATH_STR} ${OPENCV_PATH_STR})
  set(CMD_ARG_STR "--model_dir=\"${CMAKE_CURRENT_SOURCE_DIR}/../../bin/models\" --show --effect=SuperRes --resolution=1080 --in_file=\"${CMAKE_CURRENT_SOURCE_DIR}/../input/input1.jpg\"")
  set_target_properties(VideoEffectsApp PROPERTIES
    FOLDER SampleApps
    VS_DEBUGGER_ENVIRONMENT "${PATH_STR}"
    VS_DEBUGGER_COMMAND_ARGUMENTS "${CMD_ARG_STR}"
  )
else()
  target_link_libraries(VideoEffectsApp PRIVATE
    NVVideoEffects
    NVCVImage
    OpenCV
    TensorRT
    CUDA
  )
endif()
|
@ -0,0 +1,751 @@
|
||||
/*###############################################################################
|
||||
#
|
||||
# Copyright (c) 2020 NVIDIA Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
###############################################################################*/
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
#include "nvCVOpenCV.h"
|
||||
#include "nvVideoEffects.h"
|
||||
#include "opencv2/opencv.hpp"
|
||||
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define strcasecmp _stricmp
|
||||
#include <Windows.h>
|
||||
#else // !_MSC_VER
|
||||
#include <sys/stat.h>
|
||||
#endif // _MSC_VER
|
||||
|
||||
#define BAIL_IF_ERR(err) do { if (0 != (err)) { goto bail; } } while(0)
|
||||
#define BAIL_IF_NULL(x, err, code) do { if ((void*)(x) == NULL) { err = code; goto bail; } } while(0)
|
||||
#define NVCV_ERR_HELP 411
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DEFAULT_CODEC "avc1"
|
||||
#else // !_WIN32
|
||||
#define DEFAULT_CODEC "H264"
|
||||
#endif // _WIN32
|
||||
|
||||
|
||||
bool FLAG_debug = false,
|
||||
FLAG_verbose = false,
|
||||
FLAG_show = false,
|
||||
FLAG_progress = false,
|
||||
FLAG_webcam = false;
|
||||
float FLAG_strength = 0.f;
|
||||
int FLAG_mode = 0;
|
||||
int FLAG_resolution = 0;
|
||||
std::string FLAG_codec = DEFAULT_CODEC,
|
||||
FLAG_camRes = "1280x720",
|
||||
FLAG_inFile,
|
||||
FLAG_outFile,
|
||||
FLAG_outDir,
|
||||
FLAG_modelDir,
|
||||
FLAG_effect;
|
||||
|
||||
// Set this when using OTA Updates
|
||||
// This path is used by nvVideoEffectsProxy.cpp to load the SDK dll
|
||||
// when using OTA Updates
|
||||
char *g_nvVFXSDKPath = NULL;
|
||||
|
||||
// Base flag matcher: accepts "-flag", "--flag" and "--flag=value".
// On success *val is the value string (NULL for a bare flag) and true is
// returned; on a non-matching token false is returned and *val is untouched.
static bool GetFlagArgVal(const char *flag, const char *arg, const char **val) {
  if (*arg != '-')
    return false;
  do {                                        // strip every leading dash
    ++arg;
  } while (*arg == '-');
  const char *sep = strchr(arg, '=');
  if (sep == NULL) {
    if (strcmp(flag, arg) != 0)
      return false;
    *val = NULL;                              // bare flag: no value
    return true;
  }
  size_t n = (size_t)(sep - arg);
  if (strlen(flag) != n || strncmp(flag, arg, n) != 0)
    return false;
  *val = sep + 1;
  return true;
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, std::string *val) {
|
||||
const char *valStr;
|
||||
if (!GetFlagArgVal(flag, arg, &valStr))
|
||||
return false;
|
||||
val->assign(valStr ? valStr : "");
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, bool *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success) {
|
||||
*val = (valStr == NULL ||
|
||||
strcasecmp(valStr, "true") == 0 ||
|
||||
strcasecmp(valStr, "on") == 0 ||
|
||||
strcasecmp(valStr, "yes") == 0 ||
|
||||
strcasecmp(valStr, "1") == 0
|
||||
);
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, float *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtof(valStr, NULL);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, long *val) {
|
||||
const char *valStr;
|
||||
bool success = GetFlagArgVal(flag, arg, &valStr);
|
||||
if (success)
|
||||
*val = strtol(valStr, NULL, 10);
|
||||
return success;
|
||||
}
|
||||
|
||||
static bool GetFlagArgVal(const char *flag, const char *arg, int *val) {
|
||||
long longVal;
|
||||
bool success = GetFlagArgVal(flag, arg, &longVal);
|
||||
if (success)
|
||||
*val = (int)longVal;
|
||||
return success;
|
||||
}
|
||||
|
||||
static void Usage() {
|
||||
printf(
|
||||
"VideoEffectsApp [args ...]\n"
|
||||
" where args is:\n"
|
||||
" --in_file=<path> input file to be processed\n"
|
||||
" --webcam use a webcam as the input\n"
|
||||
" --out_file=<path> output file to be written\n"
|
||||
" --effect=<effect> the effect to apply\n"
|
||||
" --show display the results in a window (for webcam, it is always true)\n"
|
||||
" --strength=<value> strength of the upscaling effect, [0.0, 1.0]\n"
|
||||
" --mode=<value> mode of the super res or artifact reduction effect, 0 or 1, \n"
|
||||
" where 0 - conservative and 1 - aggressive\n"
|
||||
" --cam_res=[WWWx]HHH specify camera resolution as height or width x height\n"
|
||||
" supports 720 and 1080 resolutions (default \"720\") \n"
|
||||
" --resolution=<height> the desired height of the output\n"
|
||||
" --model_dir=<path> the path to the directory that contains the models\n"
|
||||
" --codec=<fourcc> the fourcc code for the desired codec (default " DEFAULT_CODEC ")\n"
|
||||
" --progress show progress\n"
|
||||
" --verbose verbose output\n"
|
||||
" --debug print extra debugging information\n"
|
||||
);
|
||||
const char* cStr;
|
||||
NvCV_Status err = NvVFX_GetString(nullptr, NVVFX_INFO, &cStr);
|
||||
if (NVCV_SUCCESS != err)
|
||||
printf("Cannot get effects: %s\n", NvCV_GetErrorStringFromCode(err));
|
||||
printf("where effects are:\n%s", cStr);
|
||||
}
|
||||
|
||||
// Parse command-line arguments into the global FLAG_* variables.
// Returns 0 on success, NVCV_ERR_HELP when --help was given, or the
// number of syntax errors otherwise (errs is never incremented here,
// so unknown flags are warned about but not counted — NOTE(review):
// confirm that is intended).
static int ParseMyArgs(int argc, char **argv) {
  int errs = 0;
  for (--argc, ++argv; argc--; ++argv) {  // skip argv[0] (program name)
    bool help;
    const char *arg = *argv;
    if (arg[0] != '-') {
      continue;  // bare (non-flag) arguments are silently ignored
    } else if ((arg[1] == '-') &&
      // Long-form flags. The first matching GetFlagArgVal overload wins,
      // so e.g. both --in and --in_file feed FLAG_inFile.
      ( GetFlagArgVal("verbose",    arg, &FLAG_verbose)    ||
        GetFlagArgVal("in",         arg, &FLAG_inFile)     ||
        GetFlagArgVal("in_file",    arg, &FLAG_inFile)     ||
        GetFlagArgVal("out",        arg, &FLAG_outFile)    ||
        GetFlagArgVal("out_file",   arg, &FLAG_outFile)    ||
        GetFlagArgVal("effect",     arg, &FLAG_effect)     ||
        GetFlagArgVal("show",       arg, &FLAG_show)       ||
        GetFlagArgVal("webcam",     arg, &FLAG_webcam)     ||
        GetFlagArgVal("cam_res",    arg, &FLAG_camRes)     ||
        GetFlagArgVal("strength",   arg, &FLAG_strength)   ||
        GetFlagArgVal("mode",       arg, &FLAG_mode)       ||
        GetFlagArgVal("resolution", arg, &FLAG_resolution) ||
        GetFlagArgVal("model_dir",  arg, &FLAG_modelDir)   ||
        GetFlagArgVal("codec",      arg, &FLAG_codec)      ||
        GetFlagArgVal("progress",   arg, &FLAG_progress)   ||
        GetFlagArgVal("debug",      arg, &FLAG_debug)
      )) {
      continue;  // recognized long flag; value already stored
    } else if (GetFlagArgVal("help", arg, &help)) {
      return NVCV_ERR_HELP;  // caller prints Usage()
    } else if (arg[1] != '-') {
      // Short flags may be bundled ("-vv"); only 'v' is recognized.
      for (++arg; *arg; ++arg) {
        if (*arg == 'v') {
          FLAG_verbose = true;
        } else {
          printf("Unknown flag ignored: \"-%c\"\n", *arg);
        }
      }
      continue;
    } else {
      printf("Unknown flag ignored: \"%s\"\n", arg);
    }
  }
  return errs;
}
|
||||
|
||||
// Case-insensitive test of whether `str` ends with `suf`.
// A suffix longer than the string can never match.
static bool HasSuffix(const char *str, const char *suf) {
  const size_t strLen = strlen(str);
  const size_t sufLen = strlen(suf);
  return (strLen >= sufLen) && (0 == strcasecmp(suf, str + (strLen - sufLen)));
}
|
||||
|
||||
static bool HasOneOfTheseSuffixes(const char *str, ...) {
|
||||
bool matches = false;
|
||||
const char *suf;
|
||||
va_list ap;
|
||||
va_start(ap, str);
|
||||
while (nullptr != (suf = va_arg(ap, const char*))) {
|
||||
if (HasSuffix(str, suf)) {
|
||||
matches = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
va_end(ap);
|
||||
return matches;
|
||||
}
|
||||
|
||||
static bool IsImageFile(const char *str) {
|
||||
return HasOneOfTheseSuffixes(str, ".bmp", ".jpg", ".jpeg", ".png", nullptr);
|
||||
}
|
||||
|
||||
static bool IsLossyImageFile(const char *str) {
|
||||
return HasOneOfTheseSuffixes(str, ".jpg", ".jpeg", nullptr);
|
||||
}
|
||||
|
||||
|
||||
// Format a duration in seconds as "HH:MM:SS.mmm".
// Returns a pointer to a static buffer, so the result is overwritten by
// the next call and the function is not thread-safe.
static const char* DurationString(double sc) {
  static char buf[16];
  const int hours = (int)(sc / 3600.);
  sc -= hours * 3600.;
  const int minutes = (int)(sc / 60.);
  sc -= minutes * 60.;
  snprintf(buf, sizeof(buf), "%02d:%02d:%06.3f", hours, minutes, sc);
  return buf;
}
|
||||
|
||||
// Metadata describing an opened video stream, filled by GetVideoInfo().
struct VideoInfo {
  int       codec;      // fourcc packed into an int (from CAP_PROP_FOURCC)
  int       width;      // frame width in pixels
  int       height;     // frame height in pixels
  double    frameRate;  // frames per second as reported by the backend
  long long frameCount; // total frames; may be 0/unreliable for live sources
};
|
||||
|
||||
// Query codec/geometry/rate/length from an opened VideoCapture into *info,
// optionally echoing it when --verbose is set.
static void GetVideoInfo(cv::VideoCapture& reader, const char *fileName, VideoInfo *info) {
  info->codec      = (int)reader.get(cv::CAP_PROP_FOURCC);
  info->width      = (int)reader.get(cv::CAP_PROP_FRAME_WIDTH);
  info->height     = (int)reader.get(cv::CAP_PROP_FRAME_HEIGHT);
  info->frameRate  = (double)reader.get(cv::CAP_PROP_FPS);
  info->frameCount = (long long)reader.get(cv::CAP_PROP_FRAME_COUNT);
  if (FLAG_verbose)
    printf(
      "       file \"%s\"\n"
      "      codec %.4s\n"          // %.4s prints the 4 fourcc bytes via the int's address
      "      width %4d\n"
      "     height %4d\n"
      " frame rate %.3f\n"
      "frame count %4lld\n"
      "   duration %s\n",
      fileName, (char*)&info->codec, info->width, info->height, info->frameRate, info->frameCount,
      // NOTE(review): frameRate can be 0 for some sources, which would
      // make this division produce inf — confirm acceptable for display.
      DurationString(info->frameCount / info->frameRate)
    );
}
|
||||
|
||||
// Pack up to the first four characters of `str` into an int fourcc code
// (remaining bytes stay zero). Byte order follows the host machine, which
// matches how OpenCV's VideoWriter consumes fourcc ints here.
static int StringToFourcc(const std::string& str) {
  union chint { int i; char c[4]; };
  chint fourcc = { 0 };
  const size_t count = (str.size() < 4) ? str.size() : 4;
  for (size_t k = 0; k < count; ++k)
    fourcc.c[k] = str[k];
  return fourcc.i;
}
|
||||
|
||||
// Application object: owns the Video Effects SDK effect handle plus the
// CPU/GPU image buffers, and implements the image/movie processing paths.
struct FXApp {
  // Error domain merging app-level codes (positive) with NvCV_Status
  // codes (zero/negative), so one value can flow through main().
  enum Err {
    errQuit                 = +1,   // Application errors
    errFlag                 = +2,
    errRead                 = +3,
    errWrite                = +4,
    errNone                 = NVCV_SUCCESS,   // Video Effects SDK errors
    errGeneral              = NVCV_ERR_GENERAL,
    errUnimplemented        = NVCV_ERR_UNIMPLEMENTED,
    errMemory               = NVCV_ERR_MEMORY,
    errEffect               = NVCV_ERR_EFFECT,
    errSelector             = NVCV_ERR_SELECTOR,
    errBuffer               = NVCV_ERR_BUFFER,
    errParameter            = NVCV_ERR_PARAMETER,
    errMismatch             = NVCV_ERR_MISMATCH,
    errPixelFormat          = NVCV_ERR_PIXELFORMAT,
    errModel                = NVCV_ERR_MODEL,
    errLibrary              = NVCV_ERR_LIBRARY,
    errInitialization       = NVCV_ERR_INITIALIZATION,
    errFileNotFound         = NVCV_ERR_FILE,
    errFeatureNotFound      = NVCV_ERR_FEATURENOTFOUND,
    errMissingInput         = NVCV_ERR_MISSINGINPUT,
    errResolution           = NVCV_ERR_RESOLUTION,
    errUnsupportedGPU       = NVCV_ERR_UNSUPPORTEDGPU,
    errWrongGPU             = NVCV_ERR_WRONGGPU,
    errUnsupportedDriver    = NVCV_ERR_UNSUPPORTEDDRIVER,
    errCudaMemory           = NVCV_ERR_CUDA_MEMORY,   // CUDA errors
    errCudaValue            = NVCV_ERR_CUDA_VALUE,
    errCudaPitch            = NVCV_ERR_CUDA_PITCH,
    errCudaInit             = NVCV_ERR_CUDA_INIT,
    errCudaLaunch           = NVCV_ERR_CUDA_LAUNCH,
    errCudaKernel           = NVCV_ERR_CUDA_KERNEL,
    errCudaDriver           = NVCV_ERR_CUDA_DRIVER,
    errCudaUnsupported      = NVCV_ERR_CUDA_UNSUPPORTED,
    errCudaIllegalAddress   = NVCV_ERR_CUDA_ILLEGAL_ADDRESS,
    errCuda                 = NVCV_ERR_CUDA,
  };

  // Default-construct with no effect loaded and all toggles off.
  FXApp()   { _eff = nullptr; _effectName = nullptr; _inited = false; _showFPS = false; _progress = false;
              _show = false; _enableEffect = true, _drawVisualization = true, _framePeriod = 0.f; }
  // Destroys any live effect handle.
  ~FXApp()  { NvVFX_DestroyEffect(_eff); }

  void          setShow(bool show) { _show = show; }
  Err           createEffect(const char *effectSelector, const char *modelDir);
  void          destroyEffect();
  NvCV_Status   allocBuffers(unsigned width, unsigned height);
  NvCV_Status   allocTempBuffers();
  Err           processImage(const char *inFile, const char *outFile);
  Err           processMovie(const char *inFile, const char *outFile);
  Err           initCamera(cv::VideoCapture& cap);
  Err           processKey(int key);
  void          drawFrameRate(cv::Mat& img);
  void          drawEffectStatus(cv::Mat& img);
  // NvCV_Status values map directly onto the Err enum (see above).
  Err           appErrFromVfxStatus(NvCV_Status status)  { return (Err)status; }
  const char*   errorStringFromCode(Err code);

  NvVFX_Handle  _eff;         // SDK effect handle (nullptr when none)
  cv::Mat       _srcImg;      // source frame, CPU
  cv::Mat       _dstImg;      // destination frame, CPU
  NvCVImage     _srcGpuBuf;   // source staging buffer, GPU
  NvCVImage     _dstGpuBuf;   // destination staging buffer, GPU
  NvCVImage     _srcVFX;      // NvCVImage alias wrapping _srcImg
  NvCVImage     _dstVFX;      // NvCVImage alias wrapping _dstImg
  NvCVImage     _tmpVFX;  // We use the same temporary buffer for source and dst, since it auto-shapes as needed
  bool          _show;              // display results in a window
  bool          _inited;            // buffers allocated (allocBuffers is one-shot)
  bool          _showFPS;           // overlay frame rate ('f' key)
  bool          _progress;          // print percentage progress to stderr
  bool          _enableEffect;      // apply the effect vs. pass-through
  bool          _drawVisualization; // webcam-only overlay toggle ('d' key)
  const char*   _effectName;        // selector passed to createEffect
  float         _framePeriod;       // IIR-smoothed seconds/frame for FPS display
  std::chrono::high_resolution_clock::time_point _lastTime; // previous frame timestamp
};
|
||||
|
||||
// Translate an Err into a human-readable message. Non-positive codes are
// SDK statuses and are delegated to the SDK's own stringifier; positive
// app-level codes are looked up in a small table.
const char* FXApp::errorStringFromCode(Err code) {
  if ((int)code <= 0)
    return NvCV_GetErrorStringFromCode((NvCV_Status)code);
  struct LutEntry { Err code; const char *str; };
  static const LutEntry lut[] = {
    { errRead,  "There was a problem reading a file" },
    { errWrite, "There was a problem writing a file" },
    { errQuit,  "The user chose to quit the application" },
    { errFlag,  "There was a problem with the command-line arguments" },
  };
  for (const LutEntry& entry : lut)
    if (entry.code == code)
      return entry.str;
  return "UNKNOWN ERROR";
}
|
||||
|
||||
// Update the smoothed frame period from the wall-clock gap since the last
// call, and (when enabled with the 'f' key) draw the FPS onto the frame.
void FXApp::drawFrameRate(cv::Mat &img) {
  const float timeConstant = 16.f;  // smoothing horizon, in frames
  std::chrono::high_resolution_clock::time_point now = std::chrono::high_resolution_clock::now();
  std::chrono::duration<float> dur = std::chrono::duration_cast<std::chrono::duration<float>>(now - _lastTime);
  float t = dur.count();
  if (0.f < t && t < 100.f) {  // sane interval: between 0 s and 100 s
    if (_framePeriod)
      _framePeriod += (t - _framePeriod) * (1.f / timeConstant);  // 1 pole IIR filter
    else
      _framePeriod = t;  // first sample seeds the filter directly
    if (_showFPS) {
      char buf[32];
      snprintf(buf, sizeof(buf), "%.1f", 1. / _framePeriod);
      cv::putText(img, buf, cv::Point(10, img.rows - 10), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(255, 255, 255), 1);
    }
  } else {            // Ludicrous time interval; reset
    _framePeriod = 0.f;  // WAKE UP
  }
  _lastTime = now;
}
|
||||
|
||||
// Handle a key pressed in the preview window.
// Returns errQuit for q/Q/ESC, errNone otherwise.
// Fix: the original 'p' case fell through into the 'e' case. The
// fall-through was benign (the 'e' case only breaks) but fragile — any
// future code added under 'e' would silently also run for 'p'. Each case
// now terminates explicitly.
FXApp::Err FXApp::processKey(int key) {
  static const int ESC_KEY = 27;
  switch (key) {
    case 'Q': case 'q': case ESC_KEY:
      return errQuit;                       // request application exit
    case 'f': case 'F':
      _showFPS = !_showFPS;                 // toggle FPS overlay
      break;
    case 'p': case 'P': case '%':
      _progress = !_progress;               // toggle progress printout
      break;
    case 'e': case 'E':
      break;                                // reserved; currently a no-op
    case 'd': case 'D':
      if (FLAG_webcam)
        _drawVisualization = !_drawVisualization;  // webcam-only overlay
      break;
    default:
      break;                                // ignore all other keys
  }
  return errNone;
}
|
||||
|
||||
// Open camera 0 and, if --cam_res was given, request that capture
// resolution. "WWWxHHH" sets both dimensions; a bare "HHH" assumes a
// 16:9 aspect. Fails with errGeneral when the camera does not honor the
// requested size (or when --cam_res could not be parsed, since both
// dimensions then stay 0 and the final check fails — NOTE(review):
// confirm rejecting an unparsable --cam_res this way is intended).
FXApp::Err FXApp::initCamera(cv::VideoCapture& cap) {
  const int camIndex = 0;  // always the first camera
  cap.open(camIndex);
  if (!FLAG_camRes.empty()) {
    int camWidth, camHeight, n;
    n = sscanf(FLAG_camRes.c_str(), "%d%*[xX]%d", &camWidth, &camHeight);
    switch (n) {
      case 2:
        break;  // We have read both width and height
      case 1:   // height only: derive width from a 16:9 aspect, rounded
        camHeight = camWidth;
        camWidth = (int)(camHeight * (16. / 9.) + .5);
        break;
      default:  // parse failure: force the mismatch check below to fail
        camHeight = 0;
        camWidth = 0;
        break;
    }

    if (camWidth)  cap.set(cv::CAP_PROP_FRAME_WIDTH,  camWidth);
    if (camHeight) cap.set(cv::CAP_PROP_FRAME_HEIGHT, camHeight);
    // cap.get returns double; the int comparison relies on exact values —
    // TODO confirm backends report integral frame sizes here.
    if (camWidth != cap.get(cv::CAP_PROP_FRAME_WIDTH) || camHeight != cap.get(cv::CAP_PROP_FRAME_HEIGHT)) {
      printf("Error: Camera does not support %d x %d resolution\n", camWidth, camHeight);
      return errGeneral;
    }
  }
  return errNone;
}
|
||||
|
||||
// Overlay "Effect: on"/"Effect: off" near the bottom-left of the frame.
void FXApp::drawEffectStatus(cv::Mat& img) {
  const char *state = _enableEffect ? "on" : "off";
  char label[32];
  snprintf(label, sizeof(label), "Effect: %s", state);
  cv::putText(img, label, cv::Point(10, img.rows - 40), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(255, 255, 255), 1);
}
|
||||
|
||||
// Create the SDK effect named by `effectSelector`, remember the selector
// in _effectName, and point the effect at the model directory (when one
// was supplied). BAIL_IF_ERR jumps to bail: on the first SDK failure.
FXApp::Err FXApp::createEffect(const char *effectSelector, const char *modelDir) {
  NvCV_Status vfxErr;
  BAIL_IF_ERR(vfxErr = NvVFX_CreateEffect(effectSelector, &_eff));
  _effectName = effectSelector;
  // Do not set NVVFX_MODEL_DIRECTORY for NVVFX_FX_SR_UPSCALE feature as it is not a valid selector for that feature
  if (modelDir[0] != '\0' && strcmp(_effectName, NVVFX_FX_SR_UPSCALE)){
    BAIL_IF_ERR(vfxErr = NvVFX_SetString(_eff, NVVFX_MODEL_DIRECTORY, modelDir));
  }
bail:
  return appErrFromVfxStatus(vfxErr);
}
|
||||
|
||||
// Release the current effect handle and clear it so the destructor's
// NvVFX_DestroyEffect(nullptr) call is harmless — presumably the SDK
// tolerates a null handle, as the destructor relies on it too; TODO confirm.
void FXApp::destroyEffect() {
  NvVFX_DestroyEffect(_eff);
  _eff = nullptr;
}
|
||||
|
||||
// Allocate one temp buffer to be used for input and output. Reshaping of the temp buffer in NvCVImage_Transfer() is done automatically,
// and is very low overhead. We expect the destination to be largest, so we allocate that first to minimize reallocs probablistically.
// Then we Realloc for the source to get the union of the two.
// This could alternately be done at runtime by feeding in an empty temp NvCVImage, but there are advantages to allocating all memory at load time.
NvCV_Status FXApp::allocTempBuffers() {
  NvCV_Status vfxErr;
  // First sized for the (expected larger) destination, then re-allocated to
  // also accommodate the source; the result covers both shapes.
  BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(  &_tmpVFX, _dstVFX.width, _dstVFX.height, _dstVFX.pixelFormat, _dstVFX.componentType, _dstVFX.planar, NVCV_GPU, 0));
  BAIL_IF_ERR(vfxErr = NvCVImage_Realloc(&_tmpVFX, _srcVFX.width, _srcVFX.height, _srcVFX.pixelFormat, _srcVFX.componentType, _srcVFX.planar, NVCV_GPU, 0));
bail:
  return vfxErr;
}
|
||||
|
||||
// Verify that src and dst have the same aspect ratio, i.e. the scaling
// factor is identical for width and height. Cross-multiplication avoids
// floating-point division: equal aspect implies w1*h2 == h1*w2.
static NvCV_Status CheckScaleIsotropy(const NvCVImage *src, const NvCVImage *dst) {
  const bool isotropic = (src->width * dst->height == src->height * dst->width);
  if (!isotropic) {
    printf("%ux%u --> %ux%u: different scale for width and height is not supported\n",
           src->width, src->height, dst->width, dst->height);
    return NVCV_ERR_RESOLUTION;
  }
  return NVCV_SUCCESS;
}
|
||||
|
||||
// One-shot allocation of CPU frames, GPU staging buffers, and NvCVImage
// wrappers for the selected effect. Buffer formats depend on the effect:
// Transfer/ArtifactReduction/SuperRes use planar BGR f32; Upscale uses
// interleaved RGBA u8 with 32-byte row alignment. For SuperRes/Upscale
// the output height comes from --resolution and the width preserves the
// source aspect. No-op when already initialized.
// NOTE(review): an unrecognized _effectName falls through all branches and
// leaves the GPU buffers unallocated — confirm callers guarantee one of
// the four known selectors.
NvCV_Status FXApp::allocBuffers(unsigned width, unsigned height) {
  NvCV_Status vfxErr = NVCV_SUCCESS;

  if (_inited)
    return NVCV_SUCCESS;

  if (!_srcImg.data) {
    _srcImg.create(height, width, CV_8UC3);                                                         // src CPU
    BAIL_IF_NULL(_srcImg.data, vfxErr, NVCV_ERR_MEMORY);
  }
  if (!strcmp(_effectName, NVVFX_FX_TRANSFER)) {
    _dstImg.create(_srcImg.rows, _srcImg.cols, _srcImg.type());                                     // dst CPU
    BAIL_IF_NULL(_dstImg.data, vfxErr, NVCV_ERR_MEMORY);
    BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_srcGpuBuf, _srcImg.cols, _srcImg.rows, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_GPU, 1));  // src GPU
    BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_dstGpuBuf, _dstImg.cols, _dstImg.rows, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_GPU, 1));  // dst GPU
  }
  else if (!strcmp(_effectName, NVVFX_FX_ARTIFACT_REDUCTION)) {
    // Same geometry in and out; only the pixel content changes.
    _dstImg.create(_srcImg.rows, _srcImg.cols, _srcImg.type());                                     // dst CPU
    BAIL_IF_NULL(_dstImg.data, vfxErr, NVCV_ERR_MEMORY);
    BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_srcGpuBuf, _srcImg.cols, _srcImg.rows, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_GPU, 1));  // src GPU
    BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_dstGpuBuf, _dstImg.cols, _dstImg.rows, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_GPU, 1));  // dst GPU
  }
  else if (!strcmp(_effectName, NVVFX_FX_SUPER_RES)) {
    if (!FLAG_resolution) {
      printf("--resolution has not been specified\n");
      return NVCV_ERR_PARAMETER;
    }
    BAIL_IF_ERR(vfxErr = NvVFX_SetF32(_eff, NVVFX_STRENGTH, FLAG_strength));
    // Output width preserves the source aspect ratio (integer math).
    int dstWidth = _srcImg.cols * FLAG_resolution / _srcImg.rows;
    _dstImg.create(FLAG_resolution, dstWidth, _srcImg.type());                                      // dst CPU
    BAIL_IF_NULL(_dstImg.data, vfxErr, NVCV_ERR_MEMORY);
    BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_srcGpuBuf, _srcImg.cols, _srcImg.rows, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_GPU, 1));  // src GPU
    BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_dstGpuBuf, _dstImg.cols, _dstImg.rows, NVCV_BGR, NVCV_F32, NVCV_PLANAR, NVCV_GPU, 1));  // dst GPU
    BAIL_IF_ERR(vfxErr = CheckScaleIsotropy(&_srcGpuBuf, &_dstGpuBuf));
  }
  else if (!strcmp(_effectName, NVVFX_FX_SR_UPSCALE)) {
    if (!FLAG_resolution) {
      printf("--resolution has not been specified\n");
      return NVCV_ERR_PARAMETER;
    }

    BAIL_IF_ERR(vfxErr = NvVFX_SetF32(_eff, NVVFX_STRENGTH, FLAG_strength));
    int dstWidth = _srcImg.cols * FLAG_resolution / _srcImg.rows;
    _dstImg.create(FLAG_resolution, dstWidth, _srcImg.type());                                      // dst CPU
    BAIL_IF_NULL(_dstImg.data, vfxErr, NVCV_ERR_MEMORY);
    // Upscale wants interleaved RGBA u8 with 32-byte-aligned rows.
    BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_srcGpuBuf, _srcImg.cols, _srcImg.rows, NVCV_RGBA, NVCV_U8, NVCV_INTERLEAVED,
                                         NVCV_GPU, 32));  // src GPU
    BAIL_IF_ERR(vfxErr = NvCVImage_Alloc(&_dstGpuBuf, _dstImg.cols, _dstImg.rows, NVCV_RGBA, NVCV_U8, NVCV_INTERLEAVED,
                                         NVCV_GPU, 32));  // dst GPU
    BAIL_IF_ERR(vfxErr = CheckScaleIsotropy(&_srcGpuBuf, &_dstGpuBuf));
  }
  NVWrapperForCVMat(&_srcImg, &_srcVFX);      // _srcVFX is an alias for _srcImg
  NVWrapperForCVMat(&_dstImg, &_dstVFX);      // _dstVFX is an alias for _dstImg

//#define ALLOC_TEMP_BUFFERS_AT_RUN_TIME    // Deferring temp buffer allocation is easier
#ifndef ALLOC_TEMP_BUFFERS_AT_RUN_TIME      // Allocating temp buffers at load time avoids run time hiccups
  BAIL_IF_ERR(vfxErr = allocTempBuffers());  // This uses _srcVFX and _dstVFX and allocates one buffer to be a temporary for src and dst
#endif // ALLOC_TEMP_BUFFERS_AT_RUN_TIME

  _inited = true;

bail:
  return vfxErr;
}
|
||||
|
||||
// Run the effect once on a single still image: read inFile, upload to the
// GPU, run the effect, download, then optionally write outFile and/or show
// the result for 3 seconds. Uses the default CUDA stream (0).
FXApp::Err FXApp::processImage(const char *inFile, const char *outFile) {
  CUstream    stream  = 0;
  NvCV_Status vfxErr;

  if (!_eff)
    return errEffect;        // createEffect must have succeeded first
  _srcImg = cv::imread(inFile);
  if (!_srcImg.data)
    return errRead;          // unreadable or unsupported image file

  BAIL_IF_ERR(vfxErr = allocBuffers(_srcImg.cols, _srcImg.rows));

  // Since images are uploaded asynchronously, we may as well do this first.
  // The 1/255 scale converts u8 [0,255] to normalized f32 [0,1].
  BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_srcVFX, &_srcGpuBuf, 1.f/255.f, stream, &_tmpVFX)); // _srcVFX--> _tmpVFX --> _srcGpuBuf
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_eff, NVVFX_INPUT_IMAGE,  &_srcGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_eff, NVVFX_OUTPUT_IMAGE, &_dstGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetCudaStream(_eff, NVVFX_CUDA_STREAM, stream));
  // ArtifactReduction and SuperRes take a conservative/aggressive mode flag.
  if (!strcmp(_effectName, NVVFX_FX_ARTIFACT_REDUCTION)) {
    BAIL_IF_ERR(vfxErr = NvVFX_SetU32(_eff, NVVFX_MODE, (unsigned int)FLAG_mode));
  } else if (!strcmp(_effectName, NVVFX_FX_SUPER_RES)) {
    BAIL_IF_ERR(vfxErr = NvVFX_SetU32(_eff, NVVFX_MODE, (unsigned int)FLAG_mode));
  }

  BAIL_IF_ERR(vfxErr = NvVFX_Load(_eff));
  BAIL_IF_ERR(vfxErr = NvVFX_Run(_eff, 0));                                                     // _srcGpuBuf --> _dstGpuBuf
  BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_dstGpuBuf, &_dstVFX, 255.f, stream, &_tmpVFX));     // _dstGpuBuf --> _tmpVFX --> _dstVFX

  if (outFile && outFile[0]) {
    if(IsLossyImageFile(outFile))
      fprintf(stderr, "WARNING: JPEG output file format will reduce image quality\n");
    if (!cv::imwrite(outFile, _dstImg)) {
      printf("Error writing: \"%s\"\n", outFile);
      return errWrite;
    }
  }
  if (_show) {
    cv::imshow("Output", _dstImg);
    cv::waitKey(3000);  // display the result for 3 seconds
  }
bail:
  return appErrFromVfxStatus(vfxErr);
}
|
||||
|
||||
// Run the effect frame-by-frame over a video file or live webcam.
// Each frame flows: _srcVFX --> _tmpVFX --> _srcGpuBuf --[effect]-->
// _dstGpuBuf --> _tmpVFX --> _dstVFX, then the result is written and/or
// displayed. Keyboard input is handled only when displaying.
FXApp::Err FXApp::processMovie(const char *inFile, const char *outFile) {
  const int fourcc_h264 = cv::VideoWriter::fourcc('H','2','6','4');
  CUstream          stream  = 0;
  FXApp::Err        appErr  = errNone;
  bool              ok;
  cv::VideoCapture  reader;
  cv::VideoWriter   writer;
  NvCV_Status       vfxErr;
  unsigned          frameNum;
  VideoInfo         info;

  if (inFile && !inFile[0]) inFile = nullptr;       // Set file paths to NULL if zero length

  if (!FLAG_webcam && inFile) {
    reader.open(inFile);
  } else {
    appErr = initCamera(reader);
    if (appErr != errNone)
      return appErr;
  }

  if (!reader.isOpened()) {
    if (!FLAG_webcam) printf("Error: Could not open video: \"%s\"\n", inFile);
    else              printf("Error: Webcam not found\n");
    return errRead;
  }

  GetVideoInfo(reader, (inFile ? inFile : "webcam"), &info);
  // Non-H.264 input is only warned about, not rejected.
  if (!(fourcc_h264 == info.codec || cv::VideoWriter::fourcc('a', 'v', 'c', '1') == info.codec)) // avc1 is alias for h264
    printf("Filters only target H264 videos, not %.4s\n", (char*)&info.codec);

  BAIL_IF_ERR(vfxErr = allocBuffers(info.width, info.height));

  if (outFile && !outFile[0]) outFile = nullptr;    // empty path means no output file
  if (outFile) {
    ok = writer.open(outFile, StringToFourcc(FLAG_codec), info.frameRate, cv::Size(_dstVFX.width, _dstVFX.height));
    if (!ok) {
      printf("Cannot open \"%s\" for video writing\n", outFile);
      outFile = nullptr;  // degrade to display-only if a window is up
      if (!_show)
        return errWrite;
    }
  }

  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_eff, NVVFX_INPUT_IMAGE,  &_srcGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetImage(_eff, NVVFX_OUTPUT_IMAGE, &_dstGpuBuf));
  BAIL_IF_ERR(vfxErr = NvVFX_SetCudaStream(_eff, NVVFX_CUDA_STREAM, stream));
  if (!strcmp(_effectName, NVVFX_FX_ARTIFACT_REDUCTION)) {
    BAIL_IF_ERR(vfxErr = NvVFX_SetU32(_eff, NVVFX_MODE, (unsigned int)FLAG_mode));
  } else if (!strcmp(_effectName, NVVFX_FX_SUPER_RES)) {
    BAIL_IF_ERR(vfxErr = NvVFX_SetU32(_eff, NVVFX_MODE, (unsigned int)FLAG_mode));
  }
  BAIL_IF_ERR(vfxErr = NvVFX_Load(_eff));

  for (frameNum = 0; reader.read(_srcImg); ++frameNum) {
    if (_srcImg.empty()) {
      printf("Frame %u is empty\n", frameNum);
    }

    // _srcVFX --> _srcTmpVFX --> _srcGpuBuf --> _dstGpuBuf --> _dstTmpVFX --> _dstVFX
    if (_enableEffect) {
      BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_srcVFX, &_srcGpuBuf, 1.f / 255.f, stream, &_tmpVFX));
      BAIL_IF_ERR(vfxErr = NvVFX_Run(_eff, 0));
      BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_dstGpuBuf, &_dstVFX, 255.f, stream, &_tmpVFX));
    } else {
      // Pass-through: copy src straight to dst without running the effect.
      BAIL_IF_ERR(vfxErr = NvCVImage_Transfer(&_srcVFX, &_dstVFX, 1.f / 255.f, stream, &_tmpVFX));
    }

    if (outFile)
      writer.write(_dstImg);

    if (_show) {
      drawFrameRate(_dstImg);
      cv::imshow("Output", _dstImg);
      int key= cv::waitKey(1);
      if (key > 0) {
        appErr = processKey(key);
        // NOTE(review): after this break, the function returns the last
        // vfxErr (success), so a user quit is reported as errNone rather
        // than errQuit — confirm this is the intended behavior.
        if (errQuit == appErr)
          break;
      }
    }
    if (_progress)
      // NOTE(review): info.frameCount can be 0 for live sources, making
      // this a float division by zero (prints "inf") — confirm acceptable.
      fprintf(stderr, "\b\b\b\b%3.0f%%", 100.f * frameNum / info.frameCount);
  }

  if (_progress) fprintf(stderr, "\n");
  reader.release();
  if (outFile)
    writer.release();
bail:
  return appErrFromVfxStatus(vfxErr);
}
|
||||
|
||||
// Entry point: parse flags, validate the required ones, create the chosen
// effect, and dispatch to the image or movie processing path based on the
// input file's extension. Returns 0 on success, or the (possibly negative
// SDK) error code cast to int.
int main(int argc, char **argv) {
  FXApp::Err  fxErr = FXApp::errNone;
  int         nErrs;
  FXApp       app;

  nErrs = ParseMyArgs(argc, argv);
  if (nErrs)
    std::cerr << nErrs << " command line syntax problems\n";

  if (FLAG_verbose) {
    const char *cstr = nullptr;
    NvVFX_GetString(nullptr, NVVFX_INFO, &cstr);
    std::cerr << "Effects:" << std::endl << cstr << std::endl;
  }
  if (FLAG_webcam) {
    // If webcam is on, enable showing the results and turn off displaying the progress
    if (FLAG_progress) FLAG_progress = !FLAG_progress;  // i.e. force false
    if (!FLAG_show)    FLAG_show     = !FLAG_show;      // i.e. force true
  }
  // Validate the mutually-required flag combinations; each miss both
  // reports and counts an error so Usage() is printed once at the end.
  if (FLAG_inFile.empty() && !FLAG_webcam) {
    std::cerr << "Please specify --in_file=XXX or --webcam=true\n";
    ++nErrs;
  }
  if (FLAG_outFile.empty() && !FLAG_show) {
    std::cerr << "Please specify --out_file=XXX or --show\n";
    ++nErrs;
  }
  if (FLAG_effect.empty()) {
    std::cerr << "Please specify --effect=XXX\n";
    ++nErrs;
  }
  app._progress = FLAG_progress;
  app.setShow(FLAG_show);

  if (nErrs) {
    Usage();
    fxErr = FXApp::errFlag;
  }
  else {
    fxErr = app.createEffect(FLAG_effect.c_str(), FLAG_modelDir.c_str());
    if (FXApp::errNone != fxErr) {
      std::cerr << "Error creating effect \"" << FLAG_effect << "\"\n";
    }
    else {
      // Still-image extensions go through the one-shot path; everything
      // else (videos, webcam) through the frame loop.
      if (IsImageFile(FLAG_inFile.c_str()))
        fxErr = app.processImage(FLAG_inFile.c_str(), FLAG_outFile.c_str());
      else
        fxErr = app.processMovie(FLAG_inFile.c_str(), FLAG_outFile.c_str());
    }
  }

  if (fxErr)
    std::cerr << "Error: " << app.errorStringFromCode(fxErr) << std::endl;
  return (int)fxErr;
}
|
Binary file not shown.
@ -0,0 +1,7 @@
|
||||
SETLOCAL
REM Make the bundled OpenCV DLLs discoverable by the executable.
SET PATH=%PATH%;..\external\opencv\bin;
REM Use --show to show the output in a window or use --out_file=<filename> to write output to file
REM Run ArtifactReduction in both modes, then SuperRes (4K output) in both modes.
VideoEffectsApp.exe --in_file=..\input\input1.jpg --out_file=ar_1.png --effect=ArtifactReduction --mode=1 --show
VideoEffectsApp.exe --in_file=..\input\input1.jpg --out_file=ar_0.png --effect=ArtifactReduction --mode=0 --show
VideoEffectsApp.exe --in_file=..\input\input2.jpg --out_file=sr_0.png --effect=SuperRes --resolution=2160 --mode=0 --show
VideoEffectsApp.exe --in_file=..\input\input2.jpg --out_file=sr_1.png --effect=SuperRes --resolution=2160 --mode=1 --show
|
@ -0,0 +1,60 @@
|
||||
#######################
# Interface to OpenCV #
#######################

if(MSVC)
  # Windows: use the OpenCV 3.4.6 binaries bundled with the SDK samples.
  # Pick the library folder matching the target architecture...
  if(CMAKE_CL_64)
    set(OpenCV_ARCH x64)
  elseif((CMAKE_GENERATOR MATCHES "ARM") OR ("${arch_hint}" STREQUAL "ARM") OR (CMAKE_VS_EFFECTIVE_PLATFORMS MATCHES "ARM|arm"))
    # see Modules/CmakeGenericSystem.cmake
    set(OpenCV_ARCH ARM)
  else()
    set(OpenCV_ARCH x86)
  endif()
  # ...and the MSVC runtime version (vc14 = VS2015, vc15 = VS2017).
  if(MSVC_VERSION GREATER_EQUAL 1920)
    #set(OpenCV_RUNTIME vc16)
    message("No Visual Studio 2019 OpenCV library available; trying 2017 library instead")
    set(OpenCV_RUNTIME vc15)
  elseif(MSVC_VERSION GREATER_EQUAL 1910)
    set(OpenCV_RUNTIME vc15)
  elseif(MSVC_VERSION GREATER_EQUAL 1900)
    set(OpenCV_RUNTIME vc14)
  else()
    message("MSVC_VERSION ${MSVC_VERSION} is not accommodated")
  endif()

  # INTERFACE target carrying the bundled OpenCV headers and import lib,
  # so consumers just target_link_libraries(<tgt> PRIVATE opencv346).
  add_library(opencv346 INTERFACE)
  set(OpenCV_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/opencv/include ${CMAKE_CURRENT_SOURCE_DIR}/opencv/include/opencv2)
  target_include_directories(opencv346 INTERFACE ${OpenCV_INCLUDE_DIR})
  target_link_libraries(opencv346 INTERFACE optimized ${CMAKE_CURRENT_SOURCE_DIR}/opencv/lib/opencv_world346.lib)
else()

  # Linux: locate system OpenCV and wrap its legacy variables in an
  # INTERFACE target for uniform consumption.
  find_package(OpenCV REQUIRED
               PATHS /usr /usr/local
               PATH_SUFFIXES share/OpenCV share/opencv4)
  add_library(OpenCV INTERFACE)
  target_include_directories(OpenCV INTERFACE ${OpenCV_INCLUDE_DIRS})
  target_link_libraries(OpenCV INTERFACE ${OpenCV_LIBRARIES})

  message("OpenCV_INCLUDE_DIRS ${OpenCV_INCLUDE_DIRS}")
  message("OpenCV_LIBRARIES ${OpenCV_LIBRARIES}")
  message("OpenCV_LIBS ${OpenCV_LIBS}")

  # CUDA toolkit (>= 11.3); the extra "cuda" entry links the driver API.
  find_package(CUDA 11.3 REQUIRED)
  add_library(CUDA INTERFACE)
  target_include_directories(CUDA INTERFACE ${CUDA_INCLUDE_DIRS})
  target_link_libraries(CUDA INTERFACE "${CUDA_LIBRARIES};cuda")

  message("CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}")
  message("CUDA_LIBRARIES ${CUDA_LIBRARIES}")

  # TensorRT (>= 8), via the project's FindTensorRT module.
  find_package(TensorRT 8 REQUIRED)
  add_library(TensorRT INTERFACE)
  target_include_directories(TensorRT INTERFACE ${TensorRT_INCLUDE_DIRS})
  target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

  message("TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIRS}")
  message("TensorRT_LIBRARIES ${TensorRT_LIBRARIES}")


endif()
|
@ -0,0 +1,42 @@
|
||||
--- opencv v3.4.6 ----
|
||||
By downloading, copying, installing or using the software you agree to this license.
|
||||
If you do not agree to this license, do not download, install,
|
||||
copy or use the software.
|
||||
|
||||
|
||||
License Agreement
|
||||
For Open Source Computer Vision Library
|
||||
(3-clause BSD License)
|
||||
|
||||
Copyright (C) 2000-2018, Intel Corporation, all rights reserved.
|
||||
Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
|
||||
Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
|
||||
Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved.
|
||||
Copyright (C) 2015-2016, Itseez Inc., all rights reserved.
|
||||
Third party copyrights are property of their respective owners.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the names of the copyright holders nor the names of the contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
This software is provided by the copyright holders and contributors "as is" and
|
||||
any express or implied warranties, including, but not limited to, the implied
|
||||
warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
In no event shall copyright holders or contributors be liable for any direct,
|
||||
indirect, incidental, special, exemplary, or consequential damages
|
||||
(including, but not limited to, procurement of substitute goods or services;
|
||||
loss of use, data, or profits; or business interruption) however caused
|
||||
and on any theory of liability, whether in contract, strict liability,
|
||||
or tort (including negligence or otherwise) arising in any way out of
|
||||
the use of this software, even if advised of the possibility of such damage.
|
@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "device_types.h"
|
||||
#if !defined(__CUDACC_RTC__)
|
||||
#define EXCLUDE_FROM_RTC
|
||||
#include "driver_types.h"
|
||||
#undef EXCLUDE_FROM_RTC
|
||||
#endif /* !__CUDACC_RTC__ */
|
||||
#include "surface_types.h"
|
||||
#include "texture_types.h"
|
||||
#include "vector_types.h"
|
@ -0,0 +1,595 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__CHANNEL_DESCRIPTOR_H__)
|
||||
#define __CHANNEL_DESCRIPTOR_H__
|
||||
|
||||
#if defined(__cplusplus)
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "cuda_runtime_api.h"
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
/**
|
||||
* \addtogroup CUDART_HIGHLEVEL
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* \brief \hl Returns a channel descriptor using the specified format
|
||||
*
|
||||
* Returns a channel descriptor with format \p f and number of bits of each
|
||||
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
|
||||
* defined as:
|
||||
* \code
|
||||
struct cudaChannelFormatDesc {
|
||||
int x, y, z, w;
|
||||
enum cudaChannelFormatKind f;
|
||||
};
|
||||
* \endcode
|
||||
*
|
||||
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
|
||||
* ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
|
||||
* ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
|
||||
* ::cudaChannelFormatKindSignedNormalized8X4,
|
||||
* ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
|
||||
* ::cudaChannelFormatKindUnsignedNormalized8X4,
|
||||
* ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
|
||||
* ::cudaChannelFormatKindSignedNormalized16X4,
|
||||
* ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
|
||||
* ::cudaChannelFormatKindUnsignedNormalized16X4
|
||||
* or ::cudaChannelFormatKindNV12.
|
||||
*
|
||||
* The format is specified by the template specialization.
|
||||
*
|
||||
* The template function specializes for the following scalar types:
|
||||
* char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
|
||||
* The template function specializes for the following vector types:
|
||||
* char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
|
||||
* The template function specializes for following cudaChannelFormatKind enum values:
|
||||
* ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12.
|
||||
*
|
||||
* Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
|
||||
*
|
||||
* \return
|
||||
* Channel descriptor with format \p f
|
||||
*
|
||||
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
|
||||
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
|
||||
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
|
||||
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
|
||||
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
|
||||
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
|
||||
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
|
||||
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
|
||||
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
|
||||
*/
|
||||
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
|
||||
}
|
||||
|
||||
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
|
||||
{
|
||||
int e = (int)sizeof(char) * 8;
|
||||
|
||||
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
|
||||
{
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
|
||||
{
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
|
||||
{
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
|
||||
{
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
|
||||
{
|
||||
int e = (int)sizeof(short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
|
||||
{
|
||||
int e = (int)sizeof(short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
|
||||
{
|
||||
int e = (int)sizeof(short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
|
||||
{
|
||||
int e = (int)sizeof(short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
|
||||
{
|
||||
int e = (int)sizeof(int) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
|
||||
{
|
||||
int e = (int)sizeof(int) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
|
||||
{
|
||||
int e = (int)sizeof(int) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
|
||||
{
|
||||
int e = (int)sizeof(int) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
#if !defined(__LP64__)
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
|
||||
{
|
||||
int e = (int)sizeof(long) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
|
||||
{
|
||||
int e = (int)sizeof(long) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
|
||||
{
|
||||
int e = (int)sizeof(long) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
|
||||
{
|
||||
int e = (int)sizeof(long) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
|
||||
{
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
#endif /* !__LP64__ */
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
|
||||
{
|
||||
int e = (int)sizeof(float) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
|
||||
{
|
||||
int e = (int)sizeof(float) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
|
||||
{
|
||||
int e = (int)sizeof(float) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
|
||||
{
|
||||
int e = (int)sizeof(float) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
|
||||
{
|
||||
int e = (int)sizeof(char) * 8;
|
||||
|
||||
return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
|
||||
}
|
||||
|
||||
template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
|
||||
}
|
||||
|
||||
/* Signed 8-bit normalized integer formats */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
|
||||
}
|
||||
|
||||
/* Unsigned 8-bit normalized integer formats */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
|
||||
}
|
||||
|
||||
/* Signed 16-bit normalized integer formats */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
|
||||
}
|
||||
|
||||
/* Unsigned 16-bit normalized integer formats */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
|
||||
}
|
||||
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
|
||||
}
|
||||
|
||||
/* NV12 format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
|
||||
}
|
||||
|
||||
/* BC1 format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
|
||||
}
|
||||
|
||||
/* BC1sRGB format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
|
||||
}
|
||||
|
||||
/* BC2 format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
|
||||
}
|
||||
|
||||
/* BC2sRGB format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
|
||||
}
|
||||
|
||||
/* BC3 format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
|
||||
}
|
||||
|
||||
/* BC3sRGB format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
|
||||
}
|
||||
|
||||
/* BC4 unsigned format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
|
||||
}
|
||||
|
||||
/* BC4 signed format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
|
||||
}
|
||||
|
||||
/* BC5 unsigned format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
|
||||
}
|
||||
|
||||
/* BC5 signed format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
|
||||
}
|
||||
|
||||
/* BC6H unsigned format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
|
||||
}
|
||||
|
||||
/* BC6H signed format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
|
||||
}
|
||||
|
||||
/* BC7 format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
|
||||
}
|
||||
|
||||
/* BC7sRGB format */
|
||||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
|
||||
{
|
||||
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
|
||||
}
|
||||
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/** @} */
|
||||
/** @} */ /* END CUDART_TEXTURE_HL */
|
||||
|
||||
#endif /* !__CHANNEL_DESCRIPTOR_H__ */
|
@ -0,0 +1,293 @@
|
||||
/*
|
||||
* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
||||
#if defined(_MSC_VER)
|
||||
#pragma message("crt/host_config.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
||||
#else
|
||||
#warning "crt/host_config.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
||||
#endif
|
||||
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
|
||||
#endif
|
||||
|
||||
#if !defined(__HOST_CONFIG_H__)
|
||||
#define __HOST_CONFIG_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
|
||||
#if defined(__CUDACC_RTC__)
|
||||
|
||||
#define _CRTIMP
|
||||
#define __THROW
|
||||
|
||||
#else /* __CUDACC_RTC__ */
|
||||
|
||||
/* check for host compilers that are compatible with nvcc */
|
||||
#if !defined(__GNUC__) && !defined(_WIN32)
|
||||
|
||||
#error --- !!! UNSUPPORTED COMPILER !!! ---
|
||||
|
||||
#endif /* !__GNUC__ && !_WIN32 */
|
||||
|
||||
/* check invalid configurations */
|
||||
#if defined(__PGIC__)
|
||||
#if !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__)
|
||||
#error -- unsupported pgc++ configuration! pgc++ is supported only on Linux x86_64!
|
||||
#endif /* !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__) */
|
||||
#endif /* defined(__PGIC__) */
|
||||
|
||||
#if defined(__powerpc__)
|
||||
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
|
||||
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
|
||||
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
|
||||
#endif /* __powerpc__ */
|
||||
|
||||
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
|
||||
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
|
||||
#endif /* __APPLE__ && __MACH__ && !__clang__ */
|
||||
|
||||
|
||||
/* check host compiler version */
|
||||
#if !__NV_NO_HOST_COMPILER_CHECK
|
||||
|
||||
#if defined(__ICC)
|
||||
|
||||
#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && !(__ICC >= 1900 && __ICC <= 2021)) || !defined(__GNUC__) || !defined(__LP64__)
|
||||
|
||||
#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, ICC 17.0, ICC 18.0, ICC 19.x and 20.x on Linux x86_64 are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
||||
|
||||
#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && __ICC != 1900) || !__GNUC__ || !__LP64__ */
|
||||
|
||||
#endif /* __ICC */
|
||||
|
||||
#if defined(__powerpc__)
|
||||
|
||||
#if defined(__ibmxl_vrm__) && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) && \
|
||||
!(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000)
|
||||
|
||||
#error -- unsupported xlC version! only xlC 13.1 and 16.1 are supported. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
||||
|
||||
#endif /* __ibmxl_vrm__ && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) &&
|
||||
!(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000) */
|
||||
|
||||
#endif /* __powerpc__ */
|
||||
|
||||
#if defined(__GNUC__)
|
||||
|
||||
#if __GNUC__ > 11
|
||||
|
||||
#error -- unsupported GNU version! gcc versions later than 11 are not supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
||||
|
||||
#endif /* __GNUC__ > 11 */
|
||||
|
||||
|
||||
#if defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__)
|
||||
|
||||
#if (__clang_major__ >= 14) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3))
|
||||
#error -- unsupported clang version! clang version must be less than 14 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
||||
|
||||
#endif /* (__clang_major__ >= 14) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3)) */
|
||||
|
||||
#endif /* defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) */
|
||||
|
||||
|
||||
#endif /* __GNUC__ */
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
#if _MSC_VER < 1910 || _MSC_VER >= 1940
|
||||
|
||||
#error -- unsupported Microsoft Visual Studio version! Only the versions between 2017 and 2022 (inclusive) are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
||||
|
||||
#elif _MSC_VER >= 1910 && _MSC_VER < 1910
|
||||
|
||||
#pragma message("support for this version of Microsoft Visual Studio has been deprecated! Only the versions between 2017 and 2022 (inclusive) are supported!")
|
||||
|
||||
#endif /* (_MSC_VER < 1910 || _MSC_VER >= 1940) || (_MSC_VER >= 1910 && _MSC_VER < 1910) */
|
||||
|
||||
#endif /* _WIN32 */
|
||||
#endif /* !__NV_NO_HOST_COMPILER_CHECK */
|
||||
|
||||
|
||||
/* configure host compiler */
|
||||
#if defined(__APPLE__)
|
||||
|
||||
#define _CRTIMP
|
||||
#define _ACRTIMP
|
||||
#define __THROW
|
||||
|
||||
#if defined(__BLOCKS__) /* nvcc does not support closures */
|
||||
|
||||
#undef __BLOCKS__
|
||||
|
||||
#endif /* __BLOCKS__ */
|
||||
|
||||
#elif defined(__ANDROID__)
|
||||
|
||||
#define _CRTIMP
|
||||
#define _ACRTIMP
|
||||
#define __THROW
|
||||
|
||||
#elif defined(__QNX__)
|
||||
|
||||
#define _CRTIMP
|
||||
#define _ACRTIMP
|
||||
#define __THROW
|
||||
|
||||
#elif defined(__HORIZON__)
|
||||
|
||||
#define _CRTIMP
|
||||
#define _ACRTIMP
|
||||
#define __THROW
|
||||
|
||||
#elif defined(__GNUC__)
|
||||
|
||||
#define _CRTIMP
|
||||
#define _ACRTIMP
|
||||
|
||||
#include <features.h> /* for __THROW */
|
||||
|
||||
#elif defined(_WIN32)
|
||||
|
||||
#if _MSC_VER >= 1500
|
||||
|
||||
#undef _USE_DECLSPECS_FOR_SAL
|
||||
#define _USE_DECLSPECS_FOR_SAL \
|
||||
1
|
||||
|
||||
#endif /* _MSC_VER >= 1500 */
|
||||
|
||||
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
|
||||
|
||||
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
|
||||
|
||||
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
|
||||
|
||||
#if !defined(_CRT_SECURE_NO_WARNINGS)
|
||||
|
||||
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
|
||||
|
||||
#endif /* !_CRT_SECURE_NO_WARNINGS */
|
||||
|
||||
#if !defined(NOMINMAX)
|
||||
|
||||
#define NOMINMAX /* min and max are part of cuda runtime */
|
||||
|
||||
#endif /* !NOMINMAX */
|
||||
|
||||
#include <crtdefs.h> /* for _CRTIMP */
|
||||
#if _MSC_VER >= 1900
|
||||
#include <corecrt.h> /* for _ACRTIMP */
|
||||
#endif /* _MSC_VER >= 1900 */
|
||||
|
||||
#define __THROW
|
||||
|
||||
#endif /* __APPLE__ */
|
||||
|
||||
#endif /* __CUDACC_RTC__ */
|
||||
|
||||
|
||||
#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
|
||||
|
||||
#if __CUDACC_RTC__
|
||||
typedef char *va_list;
|
||||
#else /* !__CUDACC_RTC__ */
|
||||
#include <cstdarg>
|
||||
#endif /* __CUDACC_RTC__ */
|
||||
|
||||
|
||||
#undef va_start
|
||||
#undef va_end
|
||||
#undef va_arg
|
||||
|
||||
#ifdef __PGIC__
|
||||
|
||||
#undef __builtin_va_end
|
||||
|
||||
#define va_start(v,l) __builtin_alt_va_start(v,l)
|
||||
#define va_end(v) __builtin_va_end(v)
|
||||
#define va_arg(v,l) __builtin_alt_va_arg(v,l)
|
||||
|
||||
#if (__cplusplus >= 201103L)
|
||||
#undef va_copy
|
||||
#define va_copy(d,s) __builtin_va_copy(d,s)
|
||||
#endif
|
||||
|
||||
#else /* !__PGIC__ */
|
||||
|
||||
|
||||
#define va_start(ap, x) (__cu_va_start(&ap, x))
|
||||
#define va_end(ap) (__cu_va_end(&ap))
|
||||
#define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0)))
|
||||
|
||||
#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
|
||||
#undef va_copy
|
||||
#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
|
||||
#endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
|
||||
#endif /* __PGIC__ */
|
||||
|
||||
#endif /* defined(__cplusplus) && (defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
|
||||
|
||||
|
||||
|
||||
#endif /* __CUDACC__ */
|
||||
|
||||
#endif /* !__HOST_CONFIG_H__ */
|
||||
|
||||
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__)
|
||||
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
|
||||
#endif
|
@ -0,0 +1,246 @@
|
||||
/*
|
||||
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
||||
#if defined(_MSC_VER)
|
||||
#pragma message("crt/host_defines.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
||||
#else
|
||||
#warning "crt/host_defines.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
||||
#endif
|
||||
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
|
||||
#endif
|
||||
|
||||
#if !defined(__HOST_DEFINES_H__)
|
||||
#define __HOST_DEFINES_H__
|
||||
|
||||
/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
|
||||
#if defined(__GNUC__) || (defined(__PGIC__) && defined(__linux__)) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
|
||||
|
||||
#if defined(__CUDACC_RTC__)
|
||||
#define __volatile__ volatile
|
||||
#endif /* __CUDACC_RTC__ */
|
||||
|
||||
#define __no_return__ \
|
||||
__attribute__((noreturn))
|
||||
|
||||
#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
|
||||
/* gcc allows users to define attributes with underscores,
|
||||
e.g., __attribute__((__noinline__)).
|
||||
Consider a non-CUDA source file (e.g. .cpp) that has the
|
||||
above attribute specification, and includes this header file. In that case,
|
||||
defining __noinline__ as below would cause a gcc compilation error.
|
||||
Hence, only define __noinline__ when the code is being processed
|
||||
by a CUDA compiler component.
|
||||
*/
|
||||
#define __noinline__ \
|
||||
__attribute__((noinline))
|
||||
#endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
|
||||
|
||||
#define __forceinline__ \
|
||||
__inline__ __attribute__((always_inline))
|
||||
#define __align__(n) \
|
||||
__attribute__((aligned(n)))
|
||||
#define __thread__ \
|
||||
__thread
|
||||
#define __import__
|
||||
#define __export__
|
||||
#define __cdecl
|
||||
#define __annotate__(a) \
|
||||
__attribute__((a))
|
||||
#define __location__(a) \
|
||||
__annotate__(a)
|
||||
#define CUDARTAPI
|
||||
#define CUDARTAPI_CDECL
|
||||
|
||||
#elif defined(_MSC_VER)
|
||||
|
||||
#if _MSC_VER >= 1400
|
||||
|
||||
#define __restrict__ \
|
||||
__restrict
|
||||
|
||||
#else /* _MSC_VER >= 1400 */
|
||||
|
||||
#define __restrict__
|
||||
|
||||
#endif /* _MSC_VER >= 1400 */
|
||||
|
||||
#define __inline__ \
|
||||
__inline
|
||||
#define __no_return__ \
|
||||
__declspec(noreturn)
|
||||
#define __noinline__ \
|
||||
__declspec(noinline)
|
||||
#define __forceinline__ \
|
||||
__forceinline
|
||||
#define __align__(n) \
|
||||
__declspec(align(n))
|
||||
#define __thread__ \
|
||||
__declspec(thread)
|
||||
#define __import__ \
|
||||
__declspec(dllimport)
|
||||
#define __export__ \
|
||||
__declspec(dllexport)
|
||||
#define __annotate__(a) \
|
||||
__declspec(a)
|
||||
#define __location__(a) \
|
||||
__annotate__(__##a##__)
|
||||
#define CUDARTAPI \
|
||||
__stdcall
|
||||
#define CUDARTAPI_CDECL \
|
||||
__cdecl
|
||||
|
||||
#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
|
||||
|
||||
#define __inline__
|
||||
|
||||
#if !defined(__align__)
|
||||
|
||||
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
|
||||
|
||||
#endif /* !__align__ */
|
||||
|
||||
#if !defined(CUDARTAPI)
|
||||
|
||||
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
|
||||
|
||||
#endif /* !CUDARTAPI */
|
||||
|
||||
#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
|
||||
|
||||
#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
|
||||
(defined(_MSC_VER) && _MSC_VER < 1900) || \
|
||||
(!defined(__GNUC__) && !defined(_MSC_VER))
|
||||
|
||||
#define __specialization_static \
|
||||
static
|
||||
|
||||
#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
|
||||
(_MSC_VER && _MSC_VER < 1900) ||
|
||||
(!__GNUC__ && !_MSC_VER) */
|
||||
|
||||
#define __specialization_static
|
||||
|
||||
#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
|
||||
(_MSC_VER && _MSC_VER < 1900) ||
|
||||
(!__GNUC__ && !_MSC_VER) */
|
||||
|
||||
#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
|
||||
|
||||
#undef __annotate__
|
||||
#define __annotate__(a)
|
||||
|
||||
#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
|
||||
|
||||
#define __launch_bounds__(...) \
|
||||
__annotate__(launch_bounds(__VA_ARGS__))
|
||||
|
||||
#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
|
||||
|
||||
#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
|
||||
defined(__GNUC__) || defined(_WIN64)
|
||||
|
||||
#define __builtin_align__(a) \
|
||||
__align__(a)
|
||||
|
||||
#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
|
||||
|
||||
#define __builtin_align__(a)
|
||||
|
||||
#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
|
||||
|
||||
#if defined(__CUDACC__) || !defined(__host__)
|
||||
#define __host__ \
|
||||
__location__(host)
|
||||
#endif /* defined(__CUDACC__) || !defined(__host__) */
|
||||
#if defined(__CUDACC__) || !defined(__device__)
|
||||
#define __device__ \
|
||||
__location__(device)
|
||||
#endif /* defined(__CUDACC__) || !defined(__device__) */
|
||||
#if defined(__CUDACC__) || !defined(__global__)
|
||||
#define __global__ \
|
||||
__location__(global)
|
||||
#endif /* defined(__CUDACC__) || !defined(__global__) */
|
||||
#if defined(__CUDACC__) || !defined(__shared__)
|
||||
#define __shared__ \
|
||||
__location__(shared)
|
||||
#endif /* defined(__CUDACC__) || !defined(__shared__) */
|
||||
#if defined(__CUDACC__) || !defined(__constant__)
|
||||
#define __constant__ \
|
||||
__location__(constant)
|
||||
#endif /* defined(__CUDACC__) || !defined(__constant__) */
|
||||
#if defined(__CUDACC__) || !defined(__managed__)
|
||||
#define __managed__ \
|
||||
__location__(managed)
|
||||
#endif /* defined(__CUDACC__) || !defined(__managed__) */
|
||||
|
||||
#if !defined(__CUDACC__)
|
||||
#define __device_builtin__
|
||||
#define __device_builtin_texture_type__
|
||||
#define __device_builtin_surface_type__
|
||||
#define __cudart_builtin__
|
||||
#else /* defined(__CUDACC__) */
|
||||
#define __device_builtin__ \
|
||||
__location__(device_builtin)
|
||||
#define __device_builtin_texture_type__ \
|
||||
__location__(device_builtin_texture_type)
|
||||
#define __device_builtin_surface_type__ \
|
||||
__location__(device_builtin_surface_type)
|
||||
#define __cudart_builtin__ \
|
||||
__location__(cudart_builtin)
|
||||
#endif /* !defined(__CUDACC__) */
|
||||
|
||||
|
||||
#endif /* !__HOST_DEFINES_H__ */
|
||||
|
||||
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__)
|
||||
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
|
||||
#endif
|
@ -0,0 +1,265 @@
|
||||
/*
|
||||
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
|
||||
#define __CUDA_DEVICE_RUNTIME_API_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(__CUDACC_RTC__)
|
||||
|
||||
#if !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct cudaFuncAttributes;
|
||||
|
||||
|
||||
inline __device__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
inline __device__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
inline __device__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
inline __device__ cudaError_t CUDARTAPI cudaGetDevice(int *device)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
inline __device__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
inline __device__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
|
||||
|
||||
#endif /* !defined(__CUDACC_RTC__) */
|
||||
|
||||
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
|
||||
# define __DEPRECATED__(msg)
|
||||
#elif defined(_WIN32)
|
||||
# define __DEPRECATED__(msg) __declspec(deprecated(msg))
|
||||
#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
|
||||
# define __DEPRECATED__(msg) __attribute__((deprecated))
|
||||
#else
|
||||
# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
|
||||
#endif
|
||||
|
||||
#if defined(__CUDA_ARCH__) && !defined(__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING)
|
||||
# define __CDPRT_DEPRECATED(func_name) __DEPRECATED__("Use of "#func_name" from device code is deprecated and will not be supported in a future release. Disable this warning with -D__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING.")
|
||||
#else
|
||||
# define __CDPRT_DEPRECATED(func_name)
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
|
||||
|
||||
#include "driver_types.h"
|
||||
#include "crt/host_defines.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
|
||||
extern __device__ __cudart_builtin__ __CDPRT_DEPRECATED(cudaDeviceSynchronize) cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaDeviceSynchronizeDeprecationAvoidance(void);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
|
||||
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
|
||||
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
|
||||
|
||||
/**
|
||||
* \ingroup CUDART_EXECUTION
|
||||
* \brief Obtains a parameter buffer
|
||||
*
|
||||
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
|
||||
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
|
||||
*
|
||||
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
|
||||
* CUDA user code should use <<< >>> to launch kernels.
|
||||
*
|
||||
* \param alignment - Specifies alignment requirement of the parameter buffer
|
||||
* \param size - Specifies size requirement in bytes
|
||||
*
|
||||
* \return
|
||||
* Returns pointer to the allocated parameterBuffer
|
||||
* \notefnerr
|
||||
*
|
||||
* \sa cudaLaunchDevice
|
||||
*/
|
||||
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
|
||||
|
||||
/**
|
||||
* \ingroup CUDART_EXECUTION
|
||||
* \brief Launches a specified kernel
|
||||
*
|
||||
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
|
||||
* by calling ::cudaGetParameterBuffer().
|
||||
*
|
||||
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
|
||||
* CUDA user code should use <<< >>> to launch the kernels.
|
||||
*
|
||||
* \param func - Pointer to the kernel to be launched
|
||||
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
|
||||
* \param gridDimension - Specifies grid dimensions
|
||||
* \param blockDimension - Specifies block dimensions
|
||||
* \param sharedMemSize - Specifies size of shared memory
|
||||
* \param stream - Specifies the stream to be used
|
||||
*
|
||||
* \return
|
||||
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
|
||||
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
|
||||
* \notefnerr
|
||||
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
|
||||
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
|
||||
*
|
||||
* \sa cudaGetParameterBuffer
|
||||
*/
|
||||
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
|
||||
|
||||
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
|
||||
// When compiling for the device and per thread default stream is enabled, add
|
||||
// a static inline redirect to the per thread stream entry points.
|
||||
|
||||
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
|
||||
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
|
||||
{
|
||||
return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
|
||||
}
|
||||
|
||||
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
|
||||
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
|
||||
{
|
||||
return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
|
||||
}
|
||||
#else
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
|
||||
#endif
|
||||
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
|
||||
|
||||
extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronizeGrid(unsigned long long handle, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
|
||||
}
|
||||
|
||||
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
|
||||
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
|
||||
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
|
||||
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
|
||||
|
||||
|
||||
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
|
||||
#endif /* defined(__cplusplus) && defined(__CUDACC__) */
|
||||
|
||||
#undef __DEPRECATED__
|
||||
#undef __CDPRT_DEPRECATED
|
||||
|
||||
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__DEVICE_TYPES_H__)
|
||||
#define __DEVICE_TYPES_H__
|
||||
|
||||
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
||||
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
|
||||
#endif
|
||||
|
||||
#ifndef __DOXYGEN_ONLY__
|
||||
#include "crt/host_defines.h"
|
||||
#endif
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
enum __device_builtin__ cudaRoundMode
|
||||
{
|
||||
cudaRoundNearest,
|
||||
cudaRoundZero,
|
||||
cudaRoundPosInf,
|
||||
cudaRoundMinInf
|
||||
};
|
||||
|
||||
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__)
|
||||
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
|
||||
#endif
|
||||
|
||||
#endif /* !__DEVICE_TYPES_H__ */
|
@ -0,0 +1,145 @@
|
||||
/*
|
||||
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__DRIVER_FUNCTIONS_H__)
|
||||
#define __DRIVER_FUNCTIONS_H__
|
||||
|
||||
#include "builtin_types.h"
|
||||
#include "crt/host_defines.h"
|
||||
#include "driver_types.h"
|
||||
|
||||
/**
|
||||
* \addtogroup CUDART_MEMORY
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* \brief Returns a cudaPitchedPtr based on input parameters
|
||||
*
|
||||
* Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
|
||||
* \p p, \p xsz, and \p ysz.
|
||||
*
|
||||
* \param d - Pointer to allocated memory
|
||||
* \param p - Pitch of allocated memory in bytes
|
||||
* \param xsz - Logical width of allocation in elements
|
||||
* \param ysz - Logical height of allocation in elements
|
||||
*
|
||||
* \return
|
||||
* ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
|
||||
*
|
||||
* \sa make_cudaExtent, make_cudaPos
|
||||
*/
|
||||
static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
|
||||
{
|
||||
struct cudaPitchedPtr s;
|
||||
|
||||
s.ptr = d;
|
||||
s.pitch = p;
|
||||
s.xsize = xsz;
|
||||
s.ysize = ysz;
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Returns a cudaPos based on input parameters
|
||||
*
|
||||
* Returns a ::cudaPos based on the specified input parameters \p x,
|
||||
* \p y, and \p z.
|
||||
*
|
||||
* \param x - X position
|
||||
* \param y - Y position
|
||||
* \param z - Z position
|
||||
*
|
||||
* \return
|
||||
* ::cudaPos specified by \p x, \p y, and \p z
|
||||
*
|
||||
* \sa make_cudaExtent, make_cudaPitchedPtr
|
||||
*/
|
||||
static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
|
||||
{
|
||||
struct cudaPos p;
|
||||
|
||||
p.x = x;
|
||||
p.y = y;
|
||||
p.z = z;
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Returns a cudaExtent based on input parameters
|
||||
*
|
||||
* Returns a ::cudaExtent based on the specified input parameters \p w,
|
||||
* \p h, and \p d.
|
||||
*
|
||||
* \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
|
||||
* \param h - Height in elements
|
||||
* \param d - Depth in elements
|
||||
*
|
||||
* \return
|
||||
* ::cudaExtent specified by \p w, \p h, and \p d
|
||||
*
|
||||
* \sa make_cudaPitchedPtr, make_cudaPos
|
||||
*/
|
||||
static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
|
||||
{
|
||||
struct cudaExtent e;
|
||||
|
||||
e.width = w;
|
||||
e.height = h;
|
||||
e.depth = d;
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
/** @} */ /* END CUDART_MEMORY */
|
||||
|
||||
#endif /* !__DRIVER_FUNCTIONS_H__ */
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,65 @@
|
||||
/*
|
||||
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
||||
#if defined(_MSC_VER)
|
||||
#pragma message("host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
||||
#else
|
||||
#warning "host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
||||
#endif
|
||||
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
|
||||
#endif
|
||||
|
||||
#include "crt/host_config.h"
|
||||
|
||||
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__)
|
||||
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
|
||||
#endif
|
@ -0,0 +1,65 @@
|
||||
/*
|
||||
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
||||
#if defined(_MSC_VER)
|
||||
#pragma message("host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
||||
#else
|
||||
#warning "host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
||||
#endif
|
||||
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
|
||||
#endif
|
||||
|
||||
#include "crt/host_defines.h"
|
||||
|
||||
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__)
|
||||
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
|
||||
#endif
|
@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__LIBRARY_TYPES_H__)
|
||||
#define __LIBRARY_TYPES_H__
|
||||
|
||||
|
||||
typedef enum cudaDataType_t
|
||||
{
|
||||
CUDA_R_16F = 2, /* real as a half */
|
||||
CUDA_C_16F = 6, /* complex as a pair of half numbers */
|
||||
CUDA_R_16BF = 14, /* real as a nv_bfloat16 */
|
||||
CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */
|
||||
CUDA_R_32F = 0, /* real as a float */
|
||||
CUDA_C_32F = 4, /* complex as a pair of float numbers */
|
||||
CUDA_R_64F = 1, /* real as a double */
|
||||
CUDA_C_64F = 5, /* complex as a pair of double numbers */
|
||||
CUDA_R_4I = 16, /* real as a signed 4-bit int */
|
||||
CUDA_C_4I = 17, /* complex as a pair of signed 4-bit int numbers */
|
||||
CUDA_R_4U = 18, /* real as a unsigned 4-bit int */
|
||||
CUDA_C_4U = 19, /* complex as a pair of unsigned 4-bit int numbers */
|
||||
CUDA_R_8I = 3, /* real as a signed 8-bit int */
|
||||
CUDA_C_8I = 7, /* complex as a pair of signed 8-bit int numbers */
|
||||
CUDA_R_8U = 8, /* real as a unsigned 8-bit int */
|
||||
CUDA_C_8U = 9, /* complex as a pair of unsigned 8-bit int numbers */
|
||||
CUDA_R_16I = 20, /* real as a signed 16-bit int */
|
||||
CUDA_C_16I = 21, /* complex as a pair of signed 16-bit int numbers */
|
||||
CUDA_R_16U = 22, /* real as a unsigned 16-bit int */
|
||||
CUDA_C_16U = 23, /* complex as a pair of unsigned 16-bit int numbers */
|
||||
CUDA_R_32I = 10, /* real as a signed 32-bit int */
|
||||
CUDA_C_32I = 11, /* complex as a pair of signed 32-bit int numbers */
|
||||
CUDA_R_32U = 12, /* real as a unsigned 32-bit int */
|
||||
CUDA_C_32U = 13, /* complex as a pair of unsigned 32-bit int numbers */
|
||||
CUDA_R_64I = 24, /* real as a signed 64-bit int */
|
||||
CUDA_C_64I = 25, /* complex as a pair of signed 64-bit int numbers */
|
||||
CUDA_R_64U = 26, /* real as a unsigned 64-bit int */
|
||||
CUDA_C_64U = 27 /* complex as a pair of unsigned 64-bit int numbers */
|
||||
} cudaDataType;
|
||||
|
||||
|
||||
typedef enum libraryPropertyType_t
|
||||
{
|
||||
MAJOR_VERSION,
|
||||
MINOR_VERSION,
|
||||
PATCH_LEVEL
|
||||
} libraryPropertyType;
|
||||
|
||||
|
||||
#ifndef __cplusplus
|
||||
typedef enum cudaDataType_t cudaDataType_t;
|
||||
typedef enum libraryPropertyType_t libraryPropertyType_t;
|
||||
#endif
|
||||
|
||||
#endif /* !__LIBRARY_TYPES_H__ */
|
@ -0,0 +1,119 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__SURFACE_TYPES_H__)
|
||||
#define __SURFACE_TYPES_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "driver_types.h"
|
||||
|
||||
/**
|
||||
* \addtogroup CUDART_TYPES
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#define cudaSurfaceType1D 0x01
|
||||
#define cudaSurfaceType2D 0x02
|
||||
#define cudaSurfaceType3D 0x03
|
||||
#define cudaSurfaceTypeCubemap 0x0C
|
||||
#define cudaSurfaceType1DLayered 0xF1
|
||||
#define cudaSurfaceType2DLayered 0xF2
|
||||
#define cudaSurfaceTypeCubemapLayered 0xFC
|
||||
|
||||
/**
|
||||
* CUDA Surface boundary modes
|
||||
*/
|
||||
enum __device_builtin__ cudaSurfaceBoundaryMode
|
||||
{
|
||||
cudaBoundaryModeZero = 0, /**< Zero boundary mode */
|
||||
cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
|
||||
cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA Surface format modes
|
||||
*/
|
||||
enum __device_builtin__ cudaSurfaceFormatMode
|
||||
{
|
||||
cudaFormatModeForced = 0, /**< Forced format mode */
|
||||
cudaFormatModeAuto = 1 /**< Auto format mode */
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA Surface reference
|
||||
*/
|
||||
struct __device_builtin__ surfaceReference
|
||||
{
|
||||
/**
|
||||
* Channel descriptor for surface reference
|
||||
*/
|
||||
struct cudaChannelFormatDesc channelDesc;
|
||||
};
|
||||
|
||||
/**
|
||||
* An opaque value that represents a CUDA Surface object
|
||||
*/
|
||||
typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
|
||||
|
||||
/** @} */
|
||||
/** @} */ /* END CUDART_TYPES */
|
||||
|
||||
#endif /* !__SURFACE_TYPES_H__ */
|
@ -0,0 +1,229 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__TEXTURE_TYPES_H__)
|
||||
#define __TEXTURE_TYPES_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "driver_types.h"
|
||||
|
||||
/**
|
||||
* \addtogroup CUDART_TYPES
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#define cudaTextureType1D 0x01
|
||||
#define cudaTextureType2D 0x02
|
||||
#define cudaTextureType3D 0x03
|
||||
#define cudaTextureTypeCubemap 0x0C
|
||||
#define cudaTextureType1DLayered 0xF1
|
||||
#define cudaTextureType2DLayered 0xF2
|
||||
#define cudaTextureTypeCubemapLayered 0xFC
|
||||
|
||||
/**
|
||||
* CUDA texture address modes
|
||||
*/
|
||||
enum __device_builtin__ cudaTextureAddressMode
|
||||
{
|
||||
cudaAddressModeWrap = 0, /**< Wrapping address mode */
|
||||
cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
|
||||
cudaAddressModeMirror = 2, /**< Mirror address mode */
|
||||
cudaAddressModeBorder = 3 /**< Border address mode */
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA texture filter modes
|
||||
*/
|
||||
enum __device_builtin__ cudaTextureFilterMode
|
||||
{
|
||||
cudaFilterModePoint = 0, /**< Point filter mode */
|
||||
cudaFilterModeLinear = 1 /**< Linear filter mode */
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA texture read modes
|
||||
*/
|
||||
enum __device_builtin__ cudaTextureReadMode
|
||||
{
|
||||
cudaReadModeElementType = 0, /**< Read texture as specified element type */
|
||||
cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA texture reference
|
||||
*/
|
||||
struct __device_builtin__ textureReference
|
||||
{
|
||||
/**
|
||||
* Indicates whether texture reads are normalized or not
|
||||
*/
|
||||
int normalized;
|
||||
/**
|
||||
* Texture filter mode
|
||||
*/
|
||||
enum cudaTextureFilterMode filterMode;
|
||||
/**
|
||||
* Texture address mode for up to 3 dimensions
|
||||
*/
|
||||
enum cudaTextureAddressMode addressMode[3];
|
||||
/**
|
||||
* Channel descriptor for the texture reference
|
||||
*/
|
||||
struct cudaChannelFormatDesc channelDesc;
|
||||
/**
|
||||
* Perform sRGB->linear conversion during texture read
|
||||
*/
|
||||
int sRGB;
|
||||
/**
|
||||
* Limit to the anisotropy ratio
|
||||
*/
|
||||
unsigned int maxAnisotropy;
|
||||
/**
|
||||
* Mipmap filter mode
|
||||
*/
|
||||
enum cudaTextureFilterMode mipmapFilterMode;
|
||||
/**
|
||||
* Offset applied to the supplied mipmap level
|
||||
*/
|
||||
float mipmapLevelBias;
|
||||
/**
|
||||
* Lower end of the mipmap level range to clamp access to
|
||||
*/
|
||||
float minMipmapLevelClamp;
|
||||
/**
|
||||
* Upper end of the mipmap level range to clamp access to
|
||||
*/
|
||||
float maxMipmapLevelClamp;
|
||||
/**
|
||||
* Disable any trilinear filtering optimizations.
|
||||
*/
|
||||
int disableTrilinearOptimization;
|
||||
int __cudaReserved[14];
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA texture descriptor
|
||||
*/
|
||||
struct __device_builtin__ cudaTextureDesc
|
||||
{
|
||||
/**
|
||||
* Texture address mode for up to 3 dimensions
|
||||
*/
|
||||
enum cudaTextureAddressMode addressMode[3];
|
||||
/**
|
||||
* Texture filter mode
|
||||
*/
|
||||
enum cudaTextureFilterMode filterMode;
|
||||
/**
|
||||
* Texture read mode
|
||||
*/
|
||||
enum cudaTextureReadMode readMode;
|
||||
/**
|
||||
* Perform sRGB->linear conversion during texture read
|
||||
*/
|
||||
int sRGB;
|
||||
/**
|
||||
* Texture Border Color
|
||||
*/
|
||||
float borderColor[4];
|
||||
/**
|
||||
* Indicates whether texture reads are normalized or not
|
||||
*/
|
||||
int normalizedCoords;
|
||||
/**
|
||||
* Limit to the anisotropy ratio
|
||||
*/
|
||||
unsigned int maxAnisotropy;
|
||||
/**
|
||||
* Mipmap filter mode
|
||||
*/
|
||||
enum cudaTextureFilterMode mipmapFilterMode;
|
||||
/**
|
||||
* Offset applied to the supplied mipmap level
|
||||
*/
|
||||
float mipmapLevelBias;
|
||||
/**
|
||||
* Lower end of the mipmap level range to clamp access to
|
||||
*/
|
||||
float minMipmapLevelClamp;
|
||||
/**
|
||||
* Upper end of the mipmap level range to clamp access to
|
||||
*/
|
||||
float maxMipmapLevelClamp;
|
||||
/**
|
||||
* Disable any trilinear filtering optimizations.
|
||||
*/
|
||||
int disableTrilinearOptimization;
|
||||
/**
|
||||
* Enable seamless cube map filtering.
|
||||
*/
|
||||
int seamlessCubemap;
|
||||
};
|
||||
|
||||
/**
|
||||
* An opaque value that represents a CUDA texture object
|
||||
*/
|
||||
typedef __device_builtin__ unsigned long long cudaTextureObject_t;
|
||||
|
||||
/** @} */
|
||||
/** @} */ /* END CUDART_TYPES */
|
||||
|
||||
#endif /* !__TEXTURE_TYPES_H__ */
|
@ -0,0 +1,175 @@
|
||||
/*
|
||||
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__VECTOR_FUNCTIONS_H__)
|
||||
#define __VECTOR_FUNCTIONS_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "cuda_runtime_api.h"
|
||||
|
||||
#if defined(__CUDACC_RTC__)
|
||||
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
|
||||
#else /* !__CUDACC_RTC__ */
|
||||
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
|
||||
#endif /* __CUDACC_RTC__ */
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
|
||||
|
||||
#undef __VECTOR_FUNCTIONS_DECL__
|
||||
|
||||
#if !defined(__CUDACC_RTC__)
|
||||
#include "vector_functions.hpp"
|
||||
#endif /* !__CUDACC_RTC__ */
|
||||
|
||||
#endif /* !__VECTOR_FUNCTIONS_H__ */
|
@ -0,0 +1,316 @@
|
||||
/*
|
||||
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__VECTOR_FUNCTIONS_HPP__)
|
||||
#define __VECTOR_FUNCTIONS_HPP__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "cuda_runtime_api.h"
|
||||
|
||||
#if defined(__CUDACC_RTC__)
|
||||
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
|
||||
#else /* !__CUDACC_RTC__ */
|
||||
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
|
||||
#endif /* __CUDACC_RTC__ */
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
|
||||
{
|
||||
char1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
|
||||
{
|
||||
uchar1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
|
||||
{
|
||||
char2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
|
||||
{
|
||||
uchar2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
|
||||
{
|
||||
char3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
|
||||
{
|
||||
uchar3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
|
||||
{
|
||||
char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
|
||||
{
|
||||
uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
|
||||
{
|
||||
short1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
|
||||
{
|
||||
ushort1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
|
||||
{
|
||||
short2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
|
||||
{
|
||||
ushort2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
|
||||
{
|
||||
short3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
|
||||
{
|
||||
ushort3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
|
||||
{
|
||||
short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
|
||||
{
|
||||
ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
|
||||
{
|
||||
int1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
|
||||
{
|
||||
uint1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
|
||||
{
|
||||
int2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
|
||||
{
|
||||
uint2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
|
||||
{
|
||||
int3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
|
||||
{
|
||||
uint3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
|
||||
{
|
||||
int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
|
||||
{
|
||||
uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
|
||||
{
|
||||
long1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
|
||||
{
|
||||
ulong1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
|
||||
{
|
||||
long2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
|
||||
{
|
||||
ulong2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
|
||||
{
|
||||
long3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
|
||||
{
|
||||
ulong3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
|
||||
{
|
||||
long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
|
||||
{
|
||||
ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
|
||||
{
|
||||
float1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
|
||||
{
|
||||
float2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
|
||||
{
|
||||
float3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
|
||||
{
|
||||
float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
|
||||
{
|
||||
longlong1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
|
||||
{
|
||||
ulonglong1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
|
||||
{
|
||||
longlong2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
|
||||
{
|
||||
ulonglong2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
|
||||
{
|
||||
longlong3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
|
||||
{
|
||||
ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
|
||||
{
|
||||
longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
|
||||
{
|
||||
ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
|
||||
{
|
||||
double1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
|
||||
{
|
||||
double2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
|
||||
{
|
||||
double3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
|
||||
{
|
||||
double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
#undef __VECTOR_FUNCTIONS_DECL__
|
||||
|
||||
#endif /* !__VECTOR_FUNCTIONS_HPP__ */
|
||||
|
@ -0,0 +1,443 @@
|
||||
/*
|
||||
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__VECTOR_TYPES_H__)
|
||||
#define __VECTOR_TYPES_H__
|
||||
|
||||
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
||||
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__
|
||||
#endif
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef __DOXYGEN_ONLY__
|
||||
#include "crt/host_defines.h"
|
||||
#endif
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \
|
||||
defined(_WIN32) && !defined(_WIN64)
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable: 4201 4408)
|
||||
|
||||
#define __cuda_builtin_vector_align8(tag, members) \
|
||||
struct __device_builtin__ tag \
|
||||
{ \
|
||||
union \
|
||||
{ \
|
||||
struct { members }; \
|
||||
struct { long long int :1,:0; }; \
|
||||
}; \
|
||||
}
|
||||
|
||||
#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
|
||||
|
||||
#define __cuda_builtin_vector_align8(tag, members) \
|
||||
struct __device_builtin__ __align__(8) tag \
|
||||
{ \
|
||||
members \
|
||||
}
|
||||
|
||||
#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
|
||||
|
||||
struct __device_builtin__ char1
|
||||
{
|
||||
signed char x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ uchar1
|
||||
{
|
||||
unsigned char x;
|
||||
};
|
||||
|
||||
|
||||
struct __device_builtin__ __align__(2) char2
|
||||
{
|
||||
signed char x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(2) uchar2
|
||||
{
|
||||
unsigned char x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ char3
|
||||
{
|
||||
signed char x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ uchar3
|
||||
{
|
||||
unsigned char x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(4) char4
|
||||
{
|
||||
signed char x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(4) uchar4
|
||||
{
|
||||
unsigned char x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ short1
|
||||
{
|
||||
short x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ushort1
|
||||
{
|
||||
unsigned short x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(4) short2
|
||||
{
|
||||
short x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(4) ushort2
|
||||
{
|
||||
unsigned short x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ short3
|
||||
{
|
||||
short x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ushort3
|
||||
{
|
||||
unsigned short x, y, z;
|
||||
};
|
||||
|
||||
__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
|
||||
__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
|
||||
|
||||
struct __device_builtin__ int1
|
||||
{
|
||||
int x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ uint1
|
||||
{
|
||||
unsigned int x;
|
||||
};
|
||||
|
||||
__cuda_builtin_vector_align8(int2, int x; int y;);
|
||||
__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
|
||||
|
||||
struct __device_builtin__ int3
|
||||
{
|
||||
int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ uint3
|
||||
{
|
||||
unsigned int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) int4
|
||||
{
|
||||
int x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) uint4
|
||||
{
|
||||
unsigned int x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ long1
|
||||
{
|
||||
long int x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ulong1
|
||||
{
|
||||
unsigned long x;
|
||||
};
|
||||
|
||||
#if defined(_WIN32)
|
||||
__cuda_builtin_vector_align8(long2, long int x; long int y;);
|
||||
__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
|
||||
#else /* !_WIN32 */
|
||||
|
||||
struct __device_builtin__ __align__(2*sizeof(long int)) long2
|
||||
{
|
||||
long int x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
|
||||
{
|
||||
unsigned long int x, y;
|
||||
};
|
||||
|
||||
#endif /* _WIN32 */
|
||||
|
||||
struct __device_builtin__ long3
|
||||
{
|
||||
long int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ulong3
|
||||
{
|
||||
unsigned long int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) long4
|
||||
{
|
||||
long int x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) ulong4
|
||||
{
|
||||
unsigned long int x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ float1
|
||||
{
|
||||
float x;
|
||||
};
|
||||
|
||||
#if !defined(__CUDACC__) && defined(__arm__) && \
|
||||
defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-pedantic"
|
||||
|
||||
struct __device_builtin__ __attribute__((aligned(8))) float2
|
||||
{
|
||||
float x; float y; float __cuda_gnu_arm_ice_workaround[0];
|
||||
};
|
||||
|
||||
#pragma GCC poison __cuda_gnu_arm_ice_workaround
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
|
||||
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
|
||||
|
||||
__cuda_builtin_vector_align8(float2, float x; float y;);
|
||||
|
||||
#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
|
||||
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
|
||||
|
||||
struct __device_builtin__ float3
|
||||
{
|
||||
float x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) float4
|
||||
{
|
||||
float x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ longlong1
|
||||
{
|
||||
long long int x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ulonglong1
|
||||
{
|
||||
unsigned long long int x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) longlong2
|
||||
{
|
||||
long long int x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) ulonglong2
|
||||
{
|
||||
unsigned long long int x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ longlong3
|
||||
{
|
||||
long long int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ulonglong3
|
||||
{
|
||||
unsigned long long int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) longlong4
|
||||
{
|
||||
long long int x, y, z ,w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) ulonglong4
|
||||
{
|
||||
unsigned long long int x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ double1
|
||||
{
|
||||
double x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) double2
|
||||
{
|
||||
double x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ double3
|
||||
{
|
||||
double x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) double4
|
||||
{
|
||||
double x, y, z, w;
|
||||
};
|
||||
|
||||
#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64)
|
||||
|
||||
#pragma warning(pop)
|
||||
|
||||
#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
typedef __device_builtin__ struct char1 char1;
|
||||
typedef __device_builtin__ struct uchar1 uchar1;
|
||||
typedef __device_builtin__ struct char2 char2;
|
||||
typedef __device_builtin__ struct uchar2 uchar2;
|
||||
typedef __device_builtin__ struct char3 char3;
|
||||
typedef __device_builtin__ struct uchar3 uchar3;
|
||||
typedef __device_builtin__ struct char4 char4;
|
||||
typedef __device_builtin__ struct uchar4 uchar4;
|
||||
typedef __device_builtin__ struct short1 short1;
|
||||
typedef __device_builtin__ struct ushort1 ushort1;
|
||||
typedef __device_builtin__ struct short2 short2;
|
||||
typedef __device_builtin__ struct ushort2 ushort2;
|
||||
typedef __device_builtin__ struct short3 short3;
|
||||
typedef __device_builtin__ struct ushort3 ushort3;
|
||||
typedef __device_builtin__ struct short4 short4;
|
||||
typedef __device_builtin__ struct ushort4 ushort4;
|
||||
typedef __device_builtin__ struct int1 int1;
|
||||
typedef __device_builtin__ struct uint1 uint1;
|
||||
typedef __device_builtin__ struct int2 int2;
|
||||
typedef __device_builtin__ struct uint2 uint2;
|
||||
typedef __device_builtin__ struct int3 int3;
|
||||
typedef __device_builtin__ struct uint3 uint3;
|
||||
typedef __device_builtin__ struct int4 int4;
|
||||
typedef __device_builtin__ struct uint4 uint4;
|
||||
typedef __device_builtin__ struct long1 long1;
|
||||
typedef __device_builtin__ struct ulong1 ulong1;
|
||||
typedef __device_builtin__ struct long2 long2;
|
||||
typedef __device_builtin__ struct ulong2 ulong2;
|
||||
typedef __device_builtin__ struct long3 long3;
|
||||
typedef __device_builtin__ struct ulong3 ulong3;
|
||||
typedef __device_builtin__ struct long4 long4;
|
||||
typedef __device_builtin__ struct ulong4 ulong4;
|
||||
typedef __device_builtin__ struct float1 float1;
|
||||
typedef __device_builtin__ struct float2 float2;
|
||||
typedef __device_builtin__ struct float3 float3;
|
||||
typedef __device_builtin__ struct float4 float4;
|
||||
typedef __device_builtin__ struct longlong1 longlong1;
|
||||
typedef __device_builtin__ struct ulonglong1 ulonglong1;
|
||||
typedef __device_builtin__ struct longlong2 longlong2;
|
||||
typedef __device_builtin__ struct ulonglong2 ulonglong2;
|
||||
typedef __device_builtin__ struct longlong3 longlong3;
|
||||
typedef __device_builtin__ struct ulonglong3 ulonglong3;
|
||||
typedef __device_builtin__ struct longlong4 longlong4;
|
||||
typedef __device_builtin__ struct ulonglong4 ulonglong4;
|
||||
typedef __device_builtin__ struct double1 double1;
|
||||
typedef __device_builtin__ struct double2 double2;
|
||||
typedef __device_builtin__ struct double3 double3;
|
||||
typedef __device_builtin__ struct double4 double4;
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
struct __device_builtin__ dim3
|
||||
{
|
||||
unsigned int x, y, z;
|
||||
#if defined(__cplusplus)
|
||||
#if __cplusplus >= 201103L
|
||||
__host__ __device__ constexpr dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
|
||||
__host__ __device__ constexpr dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
|
||||
__host__ __device__ constexpr operator uint3(void) const { return uint3{x, y, z}; }
|
||||
#else
|
||||
__host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
|
||||
__host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
|
||||
__host__ __device__ operator uint3(void) const { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
|
||||
#endif
|
||||
#endif /* __cplusplus */
|
||||
};
|
||||
|
||||
typedef __device_builtin__ struct dim3 dim3;
|
||||
|
||||
#undef __cuda_builtin_vector_align8
|
||||
|
||||
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__)
|
||||
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
||||
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__
|
||||
#endif
|
||||
|
||||
#endif /* !__VECTOR_TYPES_H__ */
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,73 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_OLD_CV_H
|
||||
#define OPENCV_OLD_CV_H
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define CV_DO_PRAGMA(x) __pragma(x)
|
||||
#define __CVSTR2__(x) #x
|
||||
#define __CVSTR1__(x) __CVSTR2__(x)
|
||||
#define __CVMSVCLOC__ __FILE__ "("__CVSTR1__(__LINE__)") : "
|
||||
#define CV_MSG_PRAGMA(_msg) CV_DO_PRAGMA(message (__CVMSVCLOC__ _msg))
|
||||
#elif defined(__GNUC__)
|
||||
#define CV_DO_PRAGMA(x) _Pragma (#x)
|
||||
#define CV_MSG_PRAGMA(_msg) CV_DO_PRAGMA(message (_msg))
|
||||
#else
|
||||
#define CV_DO_PRAGMA(x)
|
||||
#define CV_MSG_PRAGMA(_msg)
|
||||
#endif
|
||||
#define CV_WARNING(x) CV_MSG_PRAGMA("Warning: " #x)
|
||||
|
||||
//CV_WARNING("This is a deprecated opencv header provided for compatibility. Please include a header from a corresponding opencv module")
|
||||
|
||||
#include "opencv2/core/core_c.h"
|
||||
#include "opencv2/imgproc/imgproc_c.h"
|
||||
#include "opencv2/photo/photo_c.h"
|
||||
#include "opencv2/video/tracking_c.h"
|
||||
#include "opencv2/objdetect/objdetect_c.h"
|
||||
|
||||
#if !defined(CV_IMPL)
|
||||
#define CV_IMPL extern "C"
|
||||
#endif //CV_IMPL
|
||||
|
||||
#endif // __OPENCV_OLD_CV_H_
|
@ -0,0 +1,60 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_OLD_CV_HPP
|
||||
#define OPENCV_OLD_CV_HPP
|
||||
|
||||
//#if defined(__GNUC__)
|
||||
//#warning "This is a deprecated opencv header provided for compatibility. Please include a header from a corresponding opencv module"
|
||||
//#endif
|
||||
|
||||
#include "cv.h"
|
||||
#include "opencv2/core.hpp"
|
||||
#include "opencv2/imgproc.hpp"
|
||||
#include "opencv2/photo.hpp"
|
||||
#include "opencv2/video.hpp"
|
||||
#include "opencv2/highgui.hpp"
|
||||
#include "opencv2/features2d.hpp"
|
||||
#include "opencv2/calib3d.hpp"
|
||||
#include "opencv2/objdetect.hpp"
|
||||
|
||||
#endif
|
@ -0,0 +1,57 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// Intel License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_OLD_AUX_H
|
||||
#define OPENCV_OLD_AUX_H
|
||||
|
||||
//#if defined(__GNUC__)
|
||||
//#warning "This is a deprecated opencv header provided for compatibility. Please include a header from a corresponding opencv module"
|
||||
//#endif
|
||||
|
||||
#include "opencv2/core/core_c.h"
|
||||
#include "opencv2/imgproc/imgproc_c.h"
|
||||
#include "opencv2/photo/photo_c.h"
|
||||
#include "opencv2/video/tracking_c.h"
|
||||
#include "opencv2/objdetect/objdetect_c.h"
|
||||
|
||||
#endif
|
||||
|
||||
/* End of file. */
|
@ -0,0 +1,52 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// Intel License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_OLD_AUX_HPP
|
||||
#define OPENCV_OLD_AUX_HPP
|
||||
|
||||
//#if defined(__GNUC__)
|
||||
//#warning "This is a deprecated opencv header provided for compatibility. Please include a header from a corresponding opencv module"
|
||||
//#endif
|
||||
|
||||
#include "cvaux.h"
|
||||
#include "opencv2/core/utility.hpp"
|
||||
|
||||
#endif
|
@ -0,0 +1,46 @@
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to
|
||||
// this license. If you do not agree to this license, do not download,
|
||||
// install, copy or use the software.
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2008, Google, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation or contributors may not be used to endorse
|
||||
// or promote products derived from this software without specific
|
||||
// prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is"
|
||||
// and any express or implied warranties, including, but not limited to, the
|
||||
// implied warranties of merchantability and fitness for a particular purpose
|
||||
// are disclaimed. In no event shall the Intel Corporation or contributors be
|
||||
// liable for any direct, indirect, incidental, special, exemplary, or
|
||||
// consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
|
||||
|
||||
#ifndef OPENCV_OLD_WIMAGE_HPP
|
||||
#define OPENCV_OLD_WIMAGE_HPP
|
||||
|
||||
#include "opencv2/core/wimage.hpp"
|
||||
|
||||
#endif
|
@ -0,0 +1,52 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_OLD_CXCORE_H
|
||||
#define OPENCV_OLD_CXCORE_H
|
||||
|
||||
//#if defined(__GNUC__)
|
||||
//#warning "This is a deprecated opencv header provided for compatibility. Please include a header from a corresponding opencv module"
|
||||
//#endif
|
||||
|
||||
#include "opencv2/core/core_c.h"
|
||||
|
||||
#endif
|
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_OLD_CXCORE_HPP
|
||||
#define OPENCV_OLD_CXCORE_HPP
|
||||
|
||||
//#if defined(__GNUC__)
|
||||
//#warning "This is a deprecated opencv header provided for compatibility. Please include a header from a corresponding opencv module"
|
||||
//#endif
|
||||
|
||||
#include "cxcore.h"
|
||||
#include "opencv2/core.hpp"
|
||||
|
||||
#endif
|
@ -0,0 +1,48 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_OLD_EIGEN_HPP
|
||||
#define OPENCV_OLD_EIGEN_HPP
|
||||
|
||||
#include "opencv2/core/eigen.hpp"
|
||||
|
||||
#endif
|
@ -0,0 +1,8 @@
|
||||
#ifndef OPENCV_OLD_CXMISC_H
|
||||
#define OPENCV_OLD_CXMISC_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
# include "opencv2/core/utility.hpp"
|
||||
#endif
|
||||
|
||||
#endif
|
@ -0,0 +1,48 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// Intel License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_OLD_HIGHGUI_H
|
||||
#define OPENCV_OLD_HIGHGUI_H
|
||||
|
||||
#include "opencv2/core/core_c.h"
|
||||
#include "opencv2/highgui/highgui_c.h"
|
||||
|
||||
#endif
|
@ -0,0 +1,47 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// Intel License Agreement
|
||||
//
|
||||
// Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_OLD_ML_H
|
||||
#define OPENCV_OLD_ML_H
|
||||
|
||||
#include "opencv2/core/core_c.h"
|
||||
#include "opencv2/ml.hpp"
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,48 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifdef __OPENCV_BUILD
|
||||
#error this is a compatibility header which should not be used inside the OpenCV library
|
||||
#endif
|
||||
|
||||
#include "opencv2/calib3d.hpp"
|
@ -0,0 +1,427 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CALIB3D_C_H
|
||||
#define OPENCV_CALIB3D_C_H
|
||||
|
||||
#include "opencv2/core/core_c.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/** @addtogroup calib3d_c
|
||||
@{
|
||||
*/
|
||||
|
||||
/****************************************************************************************\
|
||||
* Camera Calibration, Pose Estimation and Stereo *
|
||||
\****************************************************************************************/
|
||||
|
||||
typedef struct CvPOSITObject CvPOSITObject;
|
||||
|
||||
/* Allocates and initializes CvPOSITObject structure before doing cvPOSIT */
|
||||
CVAPI(CvPOSITObject*) cvCreatePOSITObject( CvPoint3D32f* points, int point_count );
|
||||
|
||||
|
||||
/* Runs POSIT (POSe from ITeration) algorithm for determining 3d position of
|
||||
an object given its model and projection in a weak-perspective case */
|
||||
CVAPI(void) cvPOSIT( CvPOSITObject* posit_object, CvPoint2D32f* image_points,
|
||||
double focal_length, CvTermCriteria criteria,
|
||||
float* rotation_matrix, float* translation_vector);
|
||||
|
||||
/* Releases CvPOSITObject structure */
|
||||
CVAPI(void) cvReleasePOSITObject( CvPOSITObject** posit_object );
|
||||
|
||||
/* updates the number of RANSAC iterations */
|
||||
CVAPI(int) cvRANSACUpdateNumIters( double p, double err_prob,
|
||||
int model_points, int max_iters );
|
||||
|
||||
CVAPI(void) cvConvertPointsHomogeneous( const CvMat* src, CvMat* dst );
|
||||
|
||||
/* Calculates fundamental matrix given a set of corresponding points */
|
||||
#define CV_FM_7POINT 1
|
||||
#define CV_FM_8POINT 2
|
||||
|
||||
#define CV_LMEDS 4
|
||||
#define CV_RANSAC 8
|
||||
|
||||
#define CV_FM_LMEDS_ONLY CV_LMEDS
|
||||
#define CV_FM_RANSAC_ONLY CV_RANSAC
|
||||
#define CV_FM_LMEDS CV_LMEDS
|
||||
#define CV_FM_RANSAC CV_RANSAC
|
||||
|
||||
enum
|
||||
{
|
||||
CV_ITERATIVE = 0,
|
||||
CV_EPNP = 1, // F.Moreno-Noguer, V.Lepetit and P.Fua "EPnP: Efficient Perspective-n-Point Camera Pose Estimation"
|
||||
CV_P3P = 2, // X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang; "Complete Solution Classification for the Perspective-Three-Point Problem"
|
||||
CV_DLS = 3 // Joel A. Hesch and Stergios I. Roumeliotis. "A Direct Least-Squares (DLS) Method for PnP"
|
||||
};
|
||||
|
||||
CVAPI(int) cvFindFundamentalMat( const CvMat* points1, const CvMat* points2,
|
||||
CvMat* fundamental_matrix,
|
||||
int method CV_DEFAULT(CV_FM_RANSAC),
|
||||
double param1 CV_DEFAULT(3.), double param2 CV_DEFAULT(0.99),
|
||||
CvMat* status CV_DEFAULT(NULL) );
|
||||
|
||||
/* For each input point on one of images
|
||||
computes parameters of the corresponding
|
||||
epipolar line on the other image */
|
||||
CVAPI(void) cvComputeCorrespondEpilines( const CvMat* points,
|
||||
int which_image,
|
||||
const CvMat* fundamental_matrix,
|
||||
CvMat* correspondent_lines );
|
||||
|
||||
/* Triangulation functions */
|
||||
|
||||
CVAPI(void) cvTriangulatePoints(CvMat* projMatr1, CvMat* projMatr2,
|
||||
CvMat* projPoints1, CvMat* projPoints2,
|
||||
CvMat* points4D);
|
||||
|
||||
CVAPI(void) cvCorrectMatches(CvMat* F, CvMat* points1, CvMat* points2,
|
||||
CvMat* new_points1, CvMat* new_points2);
|
||||
|
||||
|
||||
/* Computes the optimal new camera matrix according to the free scaling parameter alpha:
|
||||
alpha=0 - only valid pixels will be retained in the undistorted image
|
||||
alpha=1 - all the source image pixels will be retained in the undistorted image
|
||||
*/
|
||||
CVAPI(void) cvGetOptimalNewCameraMatrix( const CvMat* camera_matrix,
|
||||
const CvMat* dist_coeffs,
|
||||
CvSize image_size, double alpha,
|
||||
CvMat* new_camera_matrix,
|
||||
CvSize new_imag_size CV_DEFAULT(cvSize(0,0)),
|
||||
CvRect* valid_pixel_ROI CV_DEFAULT(0),
|
||||
int center_principal_point CV_DEFAULT(0));
|
||||
|
||||
/* Converts rotation vector to rotation matrix or vice versa */
|
||||
CVAPI(int) cvRodrigues2( const CvMat* src, CvMat* dst,
|
||||
CvMat* jacobian CV_DEFAULT(0) );
|
||||
|
||||
/* Finds perspective transformation between the object plane and image (view) plane */
|
||||
CVAPI(int) cvFindHomography( const CvMat* src_points,
|
||||
const CvMat* dst_points,
|
||||
CvMat* homography,
|
||||
int method CV_DEFAULT(0),
|
||||
double ransacReprojThreshold CV_DEFAULT(3),
|
||||
CvMat* mask CV_DEFAULT(0),
|
||||
int maxIters CV_DEFAULT(2000),
|
||||
double confidence CV_DEFAULT(0.995));
|
||||
|
||||
/* Computes RQ decomposition for 3x3 matrices */
|
||||
CVAPI(void) cvRQDecomp3x3( const CvMat *matrixM, CvMat *matrixR, CvMat *matrixQ,
|
||||
CvMat *matrixQx CV_DEFAULT(NULL),
|
||||
CvMat *matrixQy CV_DEFAULT(NULL),
|
||||
CvMat *matrixQz CV_DEFAULT(NULL),
|
||||
CvPoint3D64f *eulerAngles CV_DEFAULT(NULL));
|
||||
|
||||
/* Computes projection matrix decomposition */
|
||||
CVAPI(void) cvDecomposeProjectionMatrix( const CvMat *projMatr, CvMat *calibMatr,
|
||||
CvMat *rotMatr, CvMat *posVect,
|
||||
CvMat *rotMatrX CV_DEFAULT(NULL),
|
||||
CvMat *rotMatrY CV_DEFAULT(NULL),
|
||||
CvMat *rotMatrZ CV_DEFAULT(NULL),
|
||||
CvPoint3D64f *eulerAngles CV_DEFAULT(NULL));
|
||||
|
||||
/* Computes d(AB)/dA and d(AB)/dB */
|
||||
CVAPI(void) cvCalcMatMulDeriv( const CvMat* A, const CvMat* B, CvMat* dABdA, CvMat* dABdB );
|
||||
|
||||
/* Computes r3 = rodrigues(rodrigues(r2)*rodrigues(r1)),
|
||||
t3 = rodrigues(r2)*t1 + t2 and the respective derivatives */
|
||||
CVAPI(void) cvComposeRT( const CvMat* _rvec1, const CvMat* _tvec1,
|
||||
const CvMat* _rvec2, const CvMat* _tvec2,
|
||||
CvMat* _rvec3, CvMat* _tvec3,
|
||||
CvMat* dr3dr1 CV_DEFAULT(0), CvMat* dr3dt1 CV_DEFAULT(0),
|
||||
CvMat* dr3dr2 CV_DEFAULT(0), CvMat* dr3dt2 CV_DEFAULT(0),
|
||||
CvMat* dt3dr1 CV_DEFAULT(0), CvMat* dt3dt1 CV_DEFAULT(0),
|
||||
CvMat* dt3dr2 CV_DEFAULT(0), CvMat* dt3dt2 CV_DEFAULT(0) );
|
||||
|
||||
/* Projects object points to the view plane using
|
||||
the specified extrinsic and intrinsic camera parameters */
|
||||
CVAPI(void) cvProjectPoints2( const CvMat* object_points, const CvMat* rotation_vector,
|
||||
const CvMat* translation_vector, const CvMat* camera_matrix,
|
||||
const CvMat* distortion_coeffs, CvMat* image_points,
|
||||
CvMat* dpdrot CV_DEFAULT(NULL), CvMat* dpdt CV_DEFAULT(NULL),
|
||||
CvMat* dpdf CV_DEFAULT(NULL), CvMat* dpdc CV_DEFAULT(NULL),
|
||||
CvMat* dpddist CV_DEFAULT(NULL),
|
||||
double aspect_ratio CV_DEFAULT(0));
|
||||
|
||||
/* Finds extrinsic camera parameters from
|
||||
a few known corresponding point pairs and intrinsic parameters */
|
||||
CVAPI(void) cvFindExtrinsicCameraParams2( const CvMat* object_points,
|
||||
const CvMat* image_points,
|
||||
const CvMat* camera_matrix,
|
||||
const CvMat* distortion_coeffs,
|
||||
CvMat* rotation_vector,
|
||||
CvMat* translation_vector,
|
||||
int use_extrinsic_guess CV_DEFAULT(0) );
|
||||
|
||||
/* Computes initial estimate of the intrinsic camera parameters
|
||||
in case of planar calibration target (e.g. chessboard) */
|
||||
CVAPI(void) cvInitIntrinsicParams2D( const CvMat* object_points,
|
||||
const CvMat* image_points,
|
||||
const CvMat* npoints, CvSize image_size,
|
||||
CvMat* camera_matrix,
|
||||
double aspect_ratio CV_DEFAULT(1.) );
|
||||
|
||||
#define CV_CALIB_CB_ADAPTIVE_THRESH 1
|
||||
#define CV_CALIB_CB_NORMALIZE_IMAGE 2
|
||||
#define CV_CALIB_CB_FILTER_QUADS 4
|
||||
#define CV_CALIB_CB_FAST_CHECK 8
|
||||
|
||||
// Performs a fast check if a chessboard is in the input image. This is a workaround to
|
||||
// a problem of cvFindChessboardCorners being slow on images with no chessboard
|
||||
// - src: input image
|
||||
// - size: chessboard size
|
||||
// Returns 1 if a chessboard can be in this image and findChessboardCorners should be called,
|
||||
// 0 if there is no chessboard, -1 in case of error
|
||||
CVAPI(int) cvCheckChessboard(IplImage* src, CvSize size);
|
||||
|
||||
/* Detects corners on a chessboard calibration pattern */
|
||||
CVAPI(int) cvFindChessboardCorners( const void* image, CvSize pattern_size,
|
||||
CvPoint2D32f* corners,
|
||||
int* corner_count CV_DEFAULT(NULL),
|
||||
int flags CV_DEFAULT(CV_CALIB_CB_ADAPTIVE_THRESH+CV_CALIB_CB_NORMALIZE_IMAGE) );
|
||||
|
||||
/* Draws individual chessboard corners or the whole chessboard detected */
|
||||
CVAPI(void) cvDrawChessboardCorners( CvArr* image, CvSize pattern_size,
|
||||
CvPoint2D32f* corners,
|
||||
int count, int pattern_was_found );
|
||||
|
||||
#define CV_CALIB_USE_INTRINSIC_GUESS 1
|
||||
#define CV_CALIB_FIX_ASPECT_RATIO 2
|
||||
#define CV_CALIB_FIX_PRINCIPAL_POINT 4
|
||||
#define CV_CALIB_ZERO_TANGENT_DIST 8
|
||||
#define CV_CALIB_FIX_FOCAL_LENGTH 16
|
||||
#define CV_CALIB_FIX_K1 32
|
||||
#define CV_CALIB_FIX_K2 64
|
||||
#define CV_CALIB_FIX_K3 128
|
||||
#define CV_CALIB_FIX_K4 2048
|
||||
#define CV_CALIB_FIX_K5 4096
|
||||
#define CV_CALIB_FIX_K6 8192
|
||||
#define CV_CALIB_RATIONAL_MODEL 16384
|
||||
#define CV_CALIB_THIN_PRISM_MODEL 32768
|
||||
#define CV_CALIB_FIX_S1_S2_S3_S4 65536
|
||||
#define CV_CALIB_TILTED_MODEL 262144
|
||||
#define CV_CALIB_FIX_TAUX_TAUY 524288
|
||||
#define CV_CALIB_FIX_TANGENT_DIST 2097152
|
||||
|
||||
#define CV_CALIB_NINTRINSIC 18
|
||||
|
||||
/* Finds intrinsic and extrinsic camera parameters
|
||||
from a few views of known calibration pattern */
|
||||
CVAPI(double) cvCalibrateCamera2( const CvMat* object_points,
|
||||
const CvMat* image_points,
|
||||
const CvMat* point_counts,
|
||||
CvSize image_size,
|
||||
CvMat* camera_matrix,
|
||||
CvMat* distortion_coeffs,
|
||||
CvMat* rotation_vectors CV_DEFAULT(NULL),
|
||||
CvMat* translation_vectors CV_DEFAULT(NULL),
|
||||
int flags CV_DEFAULT(0),
|
||||
CvTermCriteria term_crit CV_DEFAULT(cvTermCriteria(
|
||||
CV_TERMCRIT_ITER+CV_TERMCRIT_EPS,30,DBL_EPSILON)) );
|
||||
|
||||
/* Computes various useful characteristics of the camera from the data computed by
|
||||
cvCalibrateCamera2 */
|
||||
CVAPI(void) cvCalibrationMatrixValues( const CvMat *camera_matrix,
|
||||
CvSize image_size,
|
||||
double aperture_width CV_DEFAULT(0),
|
||||
double aperture_height CV_DEFAULT(0),
|
||||
double *fovx CV_DEFAULT(NULL),
|
||||
double *fovy CV_DEFAULT(NULL),
|
||||
double *focal_length CV_DEFAULT(NULL),
|
||||
CvPoint2D64f *principal_point CV_DEFAULT(NULL),
|
||||
double *pixel_aspect_ratio CV_DEFAULT(NULL));
|
||||
|
||||
#define CV_CALIB_FIX_INTRINSIC 256
|
||||
#define CV_CALIB_SAME_FOCAL_LENGTH 512
|
||||
|
||||
/* Computes the transformation from one camera coordinate system to another one
|
||||
from a few correspondent views of the same calibration target. Optionally, calibrates
|
||||
both cameras */
|
||||
CVAPI(double) cvStereoCalibrate( const CvMat* object_points, const CvMat* image_points1,
|
||||
const CvMat* image_points2, const CvMat* npoints,
|
||||
CvMat* camera_matrix1, CvMat* dist_coeffs1,
|
||||
CvMat* camera_matrix2, CvMat* dist_coeffs2,
|
||||
CvSize image_size, CvMat* R, CvMat* T,
|
||||
CvMat* E CV_DEFAULT(0), CvMat* F CV_DEFAULT(0),
|
||||
int flags CV_DEFAULT(CV_CALIB_FIX_INTRINSIC),
|
||||
CvTermCriteria term_crit CV_DEFAULT(cvTermCriteria(
|
||||
CV_TERMCRIT_ITER+CV_TERMCRIT_EPS,30,1e-6)) );
|
||||
|
||||
#define CV_CALIB_ZERO_DISPARITY 1024
|
||||
|
||||
/* Computes 3D rotations (+ optional shift) for each camera coordinate system to make both
|
||||
views parallel (=> to make all the epipolar lines horizontal or vertical) */
|
||||
CVAPI(void) cvStereoRectify( const CvMat* camera_matrix1, const CvMat* camera_matrix2,
|
||||
const CvMat* dist_coeffs1, const CvMat* dist_coeffs2,
|
||||
CvSize image_size, const CvMat* R, const CvMat* T,
|
||||
CvMat* R1, CvMat* R2, CvMat* P1, CvMat* P2,
|
||||
CvMat* Q CV_DEFAULT(0),
|
||||
int flags CV_DEFAULT(CV_CALIB_ZERO_DISPARITY),
|
||||
double alpha CV_DEFAULT(-1),
|
||||
CvSize new_image_size CV_DEFAULT(cvSize(0,0)),
|
||||
CvRect* valid_pix_ROI1 CV_DEFAULT(0),
|
||||
CvRect* valid_pix_ROI2 CV_DEFAULT(0));
|
||||
|
||||
/* Computes rectification transformations for uncalibrated pair of images using a set
|
||||
of point correspondences */
|
||||
CVAPI(int) cvStereoRectifyUncalibrated( const CvMat* points1, const CvMat* points2,
|
||||
const CvMat* F, CvSize img_size,
|
||||
CvMat* H1, CvMat* H2,
|
||||
double threshold CV_DEFAULT(5));
|
||||
|
||||
|
||||
|
||||
/* stereo correspondence parameters and functions */
|
||||
|
||||
#define CV_STEREO_BM_NORMALIZED_RESPONSE 0
|
||||
#define CV_STEREO_BM_XSOBEL 1
|
||||
|
||||
/* Block matching algorithm structure */
|
||||
typedef struct CvStereoBMState
|
||||
{
|
||||
// pre-filtering (normalization of input images)
|
||||
int preFilterType; // =CV_STEREO_BM_NORMALIZED_RESPONSE now
|
||||
int preFilterSize; // averaging window size: ~5x5..21x21
|
||||
int preFilterCap; // the output of pre-filtering is clipped by [-preFilterCap,preFilterCap]
|
||||
|
||||
// correspondence using Sum of Absolute Difference (SAD)
|
||||
int SADWindowSize; // ~5x5..21x21
|
||||
int minDisparity; // minimum disparity (can be negative)
|
||||
int numberOfDisparities; // maximum disparity - minimum disparity (> 0)
|
||||
|
||||
// post-filtering
|
||||
int textureThreshold; // the disparity is only computed for pixels
|
||||
// with textured enough neighborhood
|
||||
int uniquenessRatio; // accept the computed disparity d* only if
|
||||
// SAD(d) >= SAD(d*)*(1 + uniquenessRatio/100.)
|
||||
// for any d != d*+/-1 within the search range.
|
||||
int speckleWindowSize; // disparity variation window
|
||||
int speckleRange; // acceptable range of variation in window
|
||||
|
||||
int trySmallerWindows; // if 1, the results may be more accurate,
|
||||
// at the expense of slower processing
|
||||
CvRect roi1, roi2;
|
||||
int disp12MaxDiff;
|
||||
|
||||
// temporary buffers
|
||||
CvMat* preFilteredImg0;
|
||||
CvMat* preFilteredImg1;
|
||||
CvMat* slidingSumBuf;
|
||||
CvMat* cost;
|
||||
CvMat* disp;
|
||||
} CvStereoBMState;
|
||||
|
||||
#define CV_STEREO_BM_BASIC 0
|
||||
#define CV_STEREO_BM_FISH_EYE 1
|
||||
#define CV_STEREO_BM_NARROW 2
|
||||
|
||||
CVAPI(CvStereoBMState*) cvCreateStereoBMState(int preset CV_DEFAULT(CV_STEREO_BM_BASIC),
|
||||
int numberOfDisparities CV_DEFAULT(0));
|
||||
|
||||
CVAPI(void) cvReleaseStereoBMState( CvStereoBMState** state );
|
||||
|
||||
CVAPI(void) cvFindStereoCorrespondenceBM( const CvArr* left, const CvArr* right,
|
||||
CvArr* disparity, CvStereoBMState* state );
|
||||
|
||||
CVAPI(CvRect) cvGetValidDisparityROI( CvRect roi1, CvRect roi2, int minDisparity,
|
||||
int numberOfDisparities, int SADWindowSize );
|
||||
|
||||
CVAPI(void) cvValidateDisparity( CvArr* disparity, const CvArr* cost,
|
||||
int minDisparity, int numberOfDisparities,
|
||||
int disp12MaxDiff CV_DEFAULT(1) );
|
||||
|
||||
/* Reprojects the computed disparity image to the 3D space using the specified 4x4 matrix */
|
||||
CVAPI(void) cvReprojectImageTo3D( const CvArr* disparityImage,
|
||||
CvArr* _3dImage, const CvMat* Q,
|
||||
int handleMissingValues CV_DEFAULT(0) );
|
||||
|
||||
/** @} calib3d_c */
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
class CV_EXPORTS CvLevMarq
|
||||
{
|
||||
public:
|
||||
CvLevMarq();
|
||||
CvLevMarq( int nparams, int nerrs, CvTermCriteria criteria=
|
||||
cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER,30,DBL_EPSILON),
|
||||
bool completeSymmFlag=false );
|
||||
~CvLevMarq();
|
||||
void init( int nparams, int nerrs, CvTermCriteria criteria=
|
||||
cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER,30,DBL_EPSILON),
|
||||
bool completeSymmFlag=false );
|
||||
bool update( const CvMat*& param, CvMat*& J, CvMat*& err );
|
||||
bool updateAlt( const CvMat*& param, CvMat*& JtJ, CvMat*& JtErr, double*& errNorm );
|
||||
|
||||
void clear();
|
||||
void step();
|
||||
enum { DONE=0, STARTED=1, CALC_J=2, CHECK_ERR=3 };
|
||||
|
||||
cv::Ptr<CvMat> mask;
|
||||
cv::Ptr<CvMat> prevParam;
|
||||
cv::Ptr<CvMat> param;
|
||||
cv::Ptr<CvMat> J;
|
||||
cv::Ptr<CvMat> err;
|
||||
cv::Ptr<CvMat> JtJ;
|
||||
cv::Ptr<CvMat> JtJN;
|
||||
cv::Ptr<CvMat> JtErr;
|
||||
cv::Ptr<CvMat> JtJV;
|
||||
cv::Ptr<CvMat> JtJW;
|
||||
double prevErrNorm, errNorm;
|
||||
int lambdaLg10;
|
||||
CvTermCriteria criteria;
|
||||
int state;
|
||||
int iters;
|
||||
bool completeSymmFlag;
|
||||
int solveMethod;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* OPENCV_CALIB3D_C_H */
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,678 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CORE_AFFINE3_HPP
|
||||
#define OPENCV_CORE_AFFINE3_HPP
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
//! @addtogroup core
|
||||
//! @{
|
||||
|
||||
/** @brief Affine transform
|
||||
*
|
||||
* It represents a 4x4 homogeneous transformation matrix \f$T\f$
|
||||
*
|
||||
* \f[T =
|
||||
* \begin{bmatrix}
|
||||
* R & t\\
|
||||
* 0 & 1\\
|
||||
* \end{bmatrix}
|
||||
* \f]
|
||||
*
|
||||
* where \f$R\f$ is a 3x3 rotation matrix and \f$t\f$ is a 3x1 translation vector.
|
||||
*
|
||||
* You can specify \f$R\f$ either by a 3x3 rotation matrix or by a 3x1 rotation vector,
|
||||
* which is converted to a 3x3 rotation matrix by the Rodrigues formula.
|
||||
*
|
||||
* To construct a matrix \f$T\f$ representing first rotation around the axis \f$r\f$ with rotation
|
||||
* angle \f$|r|\f$ in radian (right hand rule) and then translation by the vector \f$t\f$, you can use
|
||||
*
|
||||
* @code
|
||||
* cv::Vec3f r, t;
|
||||
* cv::Affine3f T(r, t);
|
||||
* @endcode
|
||||
*
|
||||
* If you already have the rotation matrix \f$R\f$, then you can use
|
||||
*
|
||||
* @code
|
||||
* cv::Matx33f R;
|
||||
* cv::Affine3f T(R, t);
|
||||
* @endcode
|
||||
*
|
||||
* To extract the rotation matrix \f$R\f$ from \f$T\f$, use
|
||||
*
|
||||
* @code
|
||||
* cv::Matx33f R = T.rotation();
|
||||
* @endcode
|
||||
*
|
||||
* To extract the translation vector \f$t\f$ from \f$T\f$, use
|
||||
*
|
||||
* @code
|
||||
* cv::Vec3f t = T.translation();
|
||||
* @endcode
|
||||
*
|
||||
* To extract the rotation vector \f$r\f$ from \f$T\f$, use
|
||||
*
|
||||
* @code
|
||||
* cv::Vec3f r = T.rvec();
|
||||
* @endcode
|
||||
*
|
||||
* Note that since the mapping from rotation vectors to rotation matrices
|
||||
* is many to one. The returned rotation vector is not necessarily the one
|
||||
* you used before to set the matrix.
|
||||
*
|
||||
* If you have two transformations \f$T = T_1 * T_2\f$, use
|
||||
*
|
||||
* @code
|
||||
* cv::Affine3f T, T1, T2;
|
||||
* T = T2.concatenate(T1);
|
||||
* @endcode
|
||||
*
|
||||
* To get the inverse transform of \f$T\f$, use
|
||||
*
|
||||
* @code
|
||||
* cv::Affine3f T, T_inv;
|
||||
* T_inv = T.inv();
|
||||
* @endcode
|
||||
*
|
||||
*/
|
||||
/** @brief Affine transform.
 *
 * Wraps a 4x4 homogeneous transformation matrix composed of a 3x3
 * rotation/linear part R and a 3x1 translation t:
 * \f[ \begin{bmatrix} R & t \\ 0 & 1 \end{bmatrix} \f]
 */
template<typename T>
class Affine3
{
public:
    typedef T float_type;
    typedef Matx<float_type, 3, 3> Mat3;
    typedef Matx<float_type, 4, 4> Mat4;
    typedef Vec<float_type, 3> Vec3;

    //! Default constructor. It represents a 4x4 identity matrix.
    Affine3();

    //! Augmented affine matrix
    Affine3(const Mat4& affine);

    /**
     * The resulting 4x4 matrix is
     *
     * \f[
     * \begin{bmatrix}
     * R & t\\
     * 0 & 1\\
     * \end{bmatrix}
     * \f]
     *
     * @param R 3x3 rotation matrix.
     * @param t 3x1 translation vector.
     */
    Affine3(const Mat3& R, const Vec3& t = Vec3::all(0));

    /**
     * Rodrigues vector.
     *
     * The last row of the current matrix is set to [0,0,0,1].
     *
     * @param rvec 3x1 rotation vector. Its direction indicates the rotation axis and its length
     *             indicates the rotation angle in radian (using right hand rule).
     * @param t 3x1 translation vector.
     */
    Affine3(const Vec3& rvec, const Vec3& t = Vec3::all(0));

    /**
     * Combines all constructors above. Supports 4x4, 3x4, 3x3, 1x3, 3x1 sizes of data matrix.
     *
     * The last row of the current matrix is set to [0,0,0,1] when data is not 4x4.
     *
     * @param data 1-channel matrix.
     *             when it is 4x4, it is copied to the current matrix and t is not used.
     *             When it is 3x4, it is copied to the upper part 3x4 of the current matrix and t is not used.
     *             When it is 3x3, it is copied to the upper left 3x3 part of the current matrix.
     *             When it is 3x1 or 1x3, it is treated as a rotation vector and the Rodrigues formula is used
     *             to compute a 3x3 rotation matrix.
     * @param t 3x1 translation vector. It is used only when data is neither 4x4 nor 3x4.
     */
    explicit Affine3(const Mat& data, const Vec3& t = Vec3::all(0));

    //! From 16-element array
    explicit Affine3(const float_type* vals);

    //! Create an 4x4 identity transform
    static Affine3 Identity();

    /**
     * Rotation matrix.
     *
     * Copy the rotation matrix to the upper left 3x3 part of the current matrix.
     * The remaining elements of the current matrix are not changed.
     *
     * @param R 3x3 rotation matrix.
     */
    void rotation(const Mat3& R);

    /**
     * Rodrigues vector.
     *
     * It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
     *
     * @param rvec 3x1 rotation vector. The direction indicates the rotation axis and
     *             its length indicates the rotation angle in radian (using the right thumb convention).
     */
    void rotation(const Vec3& rvec);

    /**
     * Combines rotation methods above. Supports 3x3, 1x3, 3x1 sizes of data matrix.
     *
     * It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
     *
     * @param data 1-channel matrix.
     *             When it is a 3x3 matrix, it sets the upper left 3x3 part of the current matrix.
     *             When it is a 1x3 or 3x1 matrix, it is used as a rotation vector. The Rodrigues formula
     *             is used to compute the rotation matrix and sets the upper left 3x3 part of the current matrix.
     */
    void rotation(const Mat& data);

    /**
     * Copy the 3x3 matrix L to the upper left part of the current matrix
     *
     * It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
     *
     * @param L 3x3 matrix.
     */
    void linear(const Mat3& L);

    /**
     * Copy t to the first three elements of the last column of the current matrix
     *
     * It sets the upper right 3x1 part of the matrix. The remaining part is unaffected.
     *
     * @param t 3x1 translation vector.
     */
    void translation(const Vec3& t);

    //! @return the upper left 3x3 part
    Mat3 rotation() const;

    //! @return the upper left 3x3 part
    Mat3 linear() const;

    //! @return the upper right 3x1 part
    Vec3 translation() const;

    //! Rodrigues vector.
    //! @return a vector representing the upper left 3x3 rotation matrix of the current matrix.
    //! @warning Since the mapping between rotation vectors and rotation matrices is many to one,
    //! this function returns only one rotation vector that represents the current rotation matrix,
    //! which is not necessarily the same one set by `rotation(const Vec3& rvec)`.
    Vec3 rvec() const;

    //! @return the inverse of the current matrix.
    Affine3 inv(int method = cv::DECOMP_SVD) const;

    //! a.rotate(R) is equivalent to Affine(R, 0) * a;
    Affine3 rotate(const Mat3& R) const;

    //! a.rotate(rvec) is equivalent to Affine(rvec, 0) * a;
    Affine3 rotate(const Vec3& rvec) const;

    //! a.translate(t) is equivalent to Affine(E, t) * a, where E is an identity matrix
    Affine3 translate(const Vec3& t) const;

    //! a.concatenate(affine) is equivalent to affine * a;
    Affine3 concatenate(const Affine3& affine) const;

    //! Element-wise conversion to another floating-point precision.
    template <typename Y> operator Affine3<Y>() const;

    //! Same as the conversion operator, but callable explicitly.
    template <typename Y> Affine3<Y> cast() const;

    //! The underlying 4x4 matrix; val[3], val[7], val[11] hold the translation
    //! (row-major layout, as used by the implementation below).
    Mat4 matrix;

#if defined EIGEN_WORLD_VERSION && defined EIGEN_GEOMETRY_MODULE_H
    // Optional Eigen interoperability; compiled only when Eigen/Geometry
    // has been included before this header.
    Affine3(const Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>& affine);
    Affine3(const Eigen::Transform<T, 3, Eigen::Affine>& affine);
    operator Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>() const;
    operator Eigen::Transform<T, 3, Eigen::Affine>() const;
#endif
};
|
||||
|
||||
//! Composition: the result applies affine2 first, then affine1
//! (implemented as affine2.concatenate(affine1)).
template<typename T> static
Affine3<T> operator*(const Affine3<T>& affine1, const Affine3<T>& affine2);

//! V is a 3-element vector with member fields x, y and z
template<typename T, typename V> static
V operator*(const Affine3<T>& affine, const V& vector);

//! Single-precision affine transform.
typedef Affine3<float> Affine3f;
//! Double-precision affine transform.
typedef Affine3<double> Affine3d;

//! Overloads for cv::Vec3, which has no .x/.y/.z members.
static Vec3f operator*(const Affine3f& affine, const Vec3f& vector);
static Vec3d operator*(const Affine3d& affine, const Vec3d& vector);
|
||||
|
||||
// DataType specialization so Affine3<_Tp> can be stored in cv::Mat and
// friends: treated as 16 channels of the underlying scalar type.
template<typename _Tp> class DataType< Affine3<_Tp> >
{
public:
    typedef Affine3<_Tp>                               value_type;
    typedef Affine3<typename DataType<_Tp>::work_type> work_type;
    typedef _Tp                                        channel_type;

    enum { generic_type = 0,
           channels     = 16,
           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
           ,depth        = DataType<channel_type>::depth
           ,type         = CV_MAKETYPE(depth, channels)
#endif
         };

    typedef Vec<channel_type, channels> vec_type;
};
|
||||
|
||||
// Non-deprecated trait entry points mirroring the DataType specialization
// above (depth of the scalar type, 16 channels).
namespace traits {
template<typename _Tp>
struct Depth< Affine3<_Tp> > { enum { value = Depth<_Tp>::value }; };
template<typename _Tp>
struct Type< Affine3<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 16) }; };
} // namespace
|
||||
|
||||
//! @} core
|
||||
|
||||
}
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// Implementation
|
||||
|
||||
// Identity transform: the 4x4 matrix starts as the identity.
template<typename T> inline
cv::Affine3<T>::Affine3()
    : matrix(Mat4::eye())
{}

// Wrap an already-augmented 4x4 matrix without modification
// (the caller is responsible for a valid [0,0,0,1] bottom row).
template<typename T> inline
cv::Affine3<T>::Affine3(const Mat4& affine)
    : matrix(affine)
{}
|
||||
|
||||
// Build from rotation matrix R and translation t; the bottom row is
// explicitly forced to [0,0,0,1] (val[12..14] = 0, val[15] = 1).
template<typename T> inline
cv::Affine3<T>::Affine3(const Mat3& R, const Vec3& t)
{
    rotation(R);
    translation(t);
    matrix.val[12] = matrix.val[13] = matrix.val[14] = 0;
    matrix.val[15] = 1;
}

// Build from a Rodrigues rotation vector and translation t;
// same bottom-row normalization as above.
template<typename T> inline
cv::Affine3<T>::Affine3(const Vec3& _rvec, const Vec3& t)
{
    rotation(_rvec);
    translation(t);
    matrix.val[12] = matrix.val[13] = matrix.val[14] = 0;
    matrix.val[15] = 1;
}
|
||||
|
||||
// Flexible constructor: accepts 4x4 (copied verbatim, t ignored),
// 3x4 ([R|t] split, t argument ignored), or 3x3 / 3x1 / 1x3 data
// (forwarded to rotation(), t used for the translation part).
template<typename T> inline
cv::Affine3<T>::Affine3(const cv::Mat& data, const Vec3& t)
{
    CV_Assert(data.type() == cv::traits::Type<T>::value);
    CV_Assert(data.channels() == 1);

    if (data.cols == 4 && data.rows == 4)
    {
        // Full augmented matrix: copy as-is; the bottom row comes from data.
        data.copyTo(matrix);
        return;
    }
    else if (data.cols == 4 && data.rows == 3)
    {
        // [R|t] matrix: split into linear part and translation column.
        rotation(data(Rect(0, 0, 3, 3)));
        translation(data(Rect(3, 0, 1, 3)));
    }
    else
    {
        // 3x3 rotation matrix or 3x1/1x3 rotation vector; use the t argument.
        rotation(data);
        translation(t);
    }

    // Normalize the bottom row to [0,0,0,1] for the non-4x4 cases.
    matrix.val[12] = matrix.val[13] = matrix.val[14] = 0;
    matrix.val[15] = 1;
}
|
||||
|
||||
// Load the full 4x4 matrix from an array of 16 values (Matx layout).
template<typename T> inline
cv::Affine3<T>::Affine3(const float_type* vals) : matrix(vals)
{}

// Named factory for the identity transform.
template<typename T> inline
cv::Affine3<T> cv::Affine3<T>::Identity()
{
    return Affine3<T>(cv::Affine3<T>::Mat4::eye());
}
|
||||
|
||||
// Setting the rotation is identical to setting the linear part:
// copy R into the upper-left 3x3 corner.
template<typename T> inline
void cv::Affine3<T>::rotation(const Mat3& R)
{
    linear(R);
}
|
||||
|
||||
// Rodrigues-vector -> rotation-matrix conversion.
// The vector's length is the rotation angle theta; its direction the axis.
template<typename T> inline
void cv::Affine3<T>::rotation(const Vec3& _rvec)
{
    double theta = norm(_rvec);

    if (theta < DBL_EPSILON)
        // Near-zero angle: identity rotation (avoids division by ~0 below).
        rotation(Mat3::eye());
    else
    {
        double c = std::cos(theta);
        double s = std::sin(theta);
        double c1 = 1. - c;
        double itheta = (theta != 0) ? 1./theta : 0.;

        // Unit rotation axis.
        Point3_<T> r = _rvec*itheta;

        // Outer product r*r^T and the skew-symmetric cross-product matrix [r]_x.
        Mat3 rrt( r.x*r.x, r.x*r.y, r.x*r.z, r.x*r.y, r.y*r.y, r.y*r.z, r.x*r.z, r.y*r.z, r.z*r.z );
        Mat3 r_x( 0, -r.z, r.y, r.z, 0, -r.x, -r.y, r.x, 0 );

        // R = cos(theta)*I + (1 - cos(theta))*r*rT + sin(theta)*[r_x]
        // where [r_x] is [0 -rz ry; rz 0 -rx; -ry rx 0]
        Mat3 R = c*Mat3::eye() + c1*rrt + s*r_x;

        rotation(R);
    }
}
|
||||
|
||||
// Combines rotation methods above. Supports 3x3, 1x3, 3x1 sizes of data matrix;
// 3x3 is treated as a rotation matrix, 1x3/3x1 as a Rodrigues vector.
template<typename T> inline
void cv::Affine3<T>::rotation(const cv::Mat& data)
{
    CV_Assert(data.type() == cv::traits::Type<T>::value);
    CV_Assert(data.channels() == 1);

    if (data.cols == 3 && data.rows == 3)
    {
        // Rotation matrix: copy into a Matx and delegate.
        Mat3 R;
        data.copyTo(R);
        rotation(R);
    }
    else if ((data.cols == 3 && data.rows == 1) || (data.cols == 1 && data.rows == 3))
    {
        // Rotation vector: normalize shape to 3x1 and delegate to Rodrigues.
        Vec3 _rvec;
        data.reshape(1, 3).copyTo(_rvec);
        rotation(_rvec);
    }
    else
        CV_Error(Error::StsError, "Input matrix can only be 3x3, 1x3 or 3x1");
}
|
||||
|
||||
template<typename T> inline
|
||||
void cv::Affine3<T>::linear(const Mat3& L)
|
||||
{
|
||||
matrix.val[0] = L.val[0]; matrix.val[1] = L.val[1]; matrix.val[ 2] = L.val[2];
|
||||
matrix.val[4] = L.val[3]; matrix.val[5] = L.val[4]; matrix.val[ 6] = L.val[5];
|
||||
matrix.val[8] = L.val[6]; matrix.val[9] = L.val[7]; matrix.val[10] = L.val[8];
|
||||
}
|
||||
|
||||
template<typename T> inline
|
||||
void cv::Affine3<T>::translation(const Vec3& t)
|
||||
{
|
||||
matrix.val[3] = t[0]; matrix.val[7] = t[1]; matrix.val[11] = t[2];
|
||||
}
|
||||
|
||||
template<typename T> inline
|
||||
typename cv::Affine3<T>::Mat3 cv::Affine3<T>::rotation() const
|
||||
{
|
||||
return linear();
|
||||
}
|
||||
|
||||
template<typename T> inline
|
||||
typename cv::Affine3<T>::Mat3 cv::Affine3<T>::linear() const
|
||||
{
|
||||
typename cv::Affine3<T>::Mat3 R;
|
||||
R.val[0] = matrix.val[0]; R.val[1] = matrix.val[1]; R.val[2] = matrix.val[ 2];
|
||||
R.val[3] = matrix.val[4]; R.val[4] = matrix.val[5]; R.val[5] = matrix.val[ 6];
|
||||
R.val[6] = matrix.val[8]; R.val[7] = matrix.val[9]; R.val[8] = matrix.val[10];
|
||||
return R;
|
||||
}
|
||||
|
||||
template<typename T> inline
|
||||
typename cv::Affine3<T>::Vec3 cv::Affine3<T>::translation() const
|
||||
{
|
||||
return Vec3(matrix.val[3], matrix.val[7], matrix.val[11]);
|
||||
}
|
||||
|
||||
// Rotation-matrix -> Rodrigues-vector conversion (cf. cv::Rodrigues).
// The rotation part is first projected onto the nearest orthogonal matrix
// via SVD (R = U * V^T) so slightly denormalized matrices still work.
template<typename T> inline
typename cv::Affine3<T>::Vec3 cv::Affine3<T>::rvec() const
{
    cv::Vec3d w;
    cv::Matx33d u, vt, R = rotation();
    cv::SVD::compute(R, w, u, vt, cv::SVD::FULL_UV + cv::SVD::MODIFY_A);
    R = u * vt;

    // Antisymmetric part of R encodes 2*sin(theta) * axis.
    double rx = R.val[7] - R.val[5];
    double ry = R.val[2] - R.val[6];
    double rz = R.val[3] - R.val[1];

    double s = std::sqrt((rx*rx + ry*ry + rz*rz)*0.25);
    // trace(R) = 1 + 2*cos(theta); clamp to [-1, 1] before acos.
    double c = (R.val[0] + R.val[4] + R.val[8] - 1) * 0.5;
    c = c > 1.0 ? 1.0 : c < -1.0 ? -1.0 : c;
    double theta = acos(c);

    if( s < 1e-5 )
    {
        // sin(theta) ~ 0: angle is near 0 or near pi; the generic formula
        // below is ill-conditioned.
        if( c > 0 )
            rx = ry = rz = 0;   // theta ~ 0: zero rotation vector
        else
        {
            // theta ~ pi: recover the axis from the diagonal of R.
            double t;
            t = (R.val[0] + 1) * 0.5;
            rx = std::sqrt(std::max(t, 0.0));
            t = (R.val[4] + 1) * 0.5;
            ry = std::sqrt(std::max(t, 0.0)) * (R.val[1] < 0 ? -1.0 : 1.0);
            t = (R.val[8] + 1) * 0.5;
            rz = std::sqrt(std::max(t, 0.0)) * (R.val[2] < 0 ? -1.0 : 1.0);

            // Resolve the remaining sign ambiguity of the smallest component.
            if( fabs(rx) < fabs(ry) && fabs(rx) < fabs(rz) && (R.val[5] > 0) != (ry*rz > 0) )
                rz = -rz;
            // Rescale the axis so its norm equals theta.
            theta /= std::sqrt(rx*rx + ry*ry + rz*rz);
            rx *= theta;
            ry *= theta;
            rz *= theta;
        }
    }
    else
    {
        // Generic case: axis = (rx,ry,rz)/(2*sin(theta)), scaled by theta.
        double vth = 1/(2*s);
        vth *= theta;
        rx *= vth; ry *= vth; rz *= vth;
    }

    return cv::Vec3d(rx, ry, rz);
}
|
||||
|
||||
template<typename T> inline
|
||||
cv::Affine3<T> cv::Affine3<T>::inv(int method) const
|
||||
{
|
||||
return matrix.inv(method);
|
||||
}
|
||||
|
||||
// a.rotate(R) == Affine(R, 0) * a: left-multiplies the current transform by
// the rotation R. Computed explicitly as result = [R*Lc | R*tc; 0 0 0 1].
template<typename T> inline
cv::Affine3<T> cv::Affine3<T>::rotate(const Mat3& R) const
{
    Mat3 Lc = linear();
    Vec3 tc = translation();
    Mat4 result;
    // Bottom row of the homogeneous result is always [0,0,0,1].
    result.val[12] = result.val[13] = result.val[14] = 0;
    result.val[15] = 1;

    for(int j = 0; j < 3; ++j)
    {
        // Row j of R times the current linear part.
        for(int i = 0; i < 3; ++i)
        {
            float_type value = 0;
            for(int k = 0; k < 3; ++k)
                value += R(j, k) * Lc(k, i);
            result(j, i) = value;
        }

        // Row j of R times the current translation.
        result(j, 3) = R.row(j).dot(tc.t());
    }
    return result;
}
|
||||
|
||||
template<typename T> inline
|
||||
cv::Affine3<T> cv::Affine3<T>::rotate(const Vec3& _rvec) const
|
||||
{
|
||||
return rotate(Affine3f(_rvec).rotation());
|
||||
}
|
||||
|
||||
template<typename T> inline
|
||||
cv::Affine3<T> cv::Affine3<T>::translate(const Vec3& t) const
|
||||
{
|
||||
Mat4 m = matrix;
|
||||
m.val[ 3] += t[0];
|
||||
m.val[ 7] += t[1];
|
||||
m.val[11] += t[2];
|
||||
return m;
|
||||
}
|
||||
|
||||
template<typename T> inline
|
||||
cv::Affine3<T> cv::Affine3<T>::concatenate(const Affine3<T>& affine) const
|
||||
{
|
||||
return (*this).rotate(affine.rotation()).translate(affine.translation());
|
||||
}
|
||||
|
||||
template<typename T> template <typename Y> inline
|
||||
cv::Affine3<T>::operator Affine3<Y>() const
|
||||
{
|
||||
return Affine3<Y>(matrix);
|
||||
}
|
||||
|
||||
template<typename T> template <typename Y> inline
|
||||
cv::Affine3<Y> cv::Affine3<T>::cast() const
|
||||
{
|
||||
return Affine3<Y>(matrix);
|
||||
}
|
||||
|
||||
template<typename T> inline
|
||||
cv::Affine3<T> cv::operator*(const cv::Affine3<T>& affine1, const cv::Affine3<T>& affine2)
|
||||
{
|
||||
return affine2.concatenate(affine1);
|
||||
}
|
||||
|
||||
template<typename T, typename V> inline
|
||||
V cv::operator*(const cv::Affine3<T>& affine, const V& v)
|
||||
{
|
||||
const typename Affine3<T>::Mat4& m = affine.matrix;
|
||||
|
||||
V r;
|
||||
r.x = m.val[0] * v.x + m.val[1] * v.y + m.val[ 2] * v.z + m.val[ 3];
|
||||
r.y = m.val[4] * v.x + m.val[5] * v.y + m.val[ 6] * v.z + m.val[ 7];
|
||||
r.z = m.val[8] * v.x + m.val[9] * v.y + m.val[10] * v.z + m.val[11];
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline
|
||||
cv::Vec3f cv::operator*(const cv::Affine3f& affine, const cv::Vec3f& v)
|
||||
{
|
||||
const cv::Matx44f& m = affine.matrix;
|
||||
cv::Vec3f r;
|
||||
r.val[0] = m.val[0] * v[0] + m.val[1] * v[1] + m.val[ 2] * v[2] + m.val[ 3];
|
||||
r.val[1] = m.val[4] * v[0] + m.val[5] * v[1] + m.val[ 6] * v[2] + m.val[ 7];
|
||||
r.val[2] = m.val[8] * v[0] + m.val[9] * v[1] + m.val[10] * v[2] + m.val[11];
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline
|
||||
cv::Vec3d cv::operator*(const cv::Affine3d& affine, const cv::Vec3d& v)
|
||||
{
|
||||
const cv::Matx44d& m = affine.matrix;
|
||||
cv::Vec3d r;
|
||||
r.val[0] = m.val[0] * v[0] + m.val[1] * v[1] + m.val[ 2] * v[2] + m.val[ 3];
|
||||
r.val[1] = m.val[4] * v[0] + m.val[5] * v[1] + m.val[ 6] * v[2] + m.val[ 7];
|
||||
r.val[2] = m.val[8] * v[0] + m.val[9] * v[1] + m.val[10] * v[2] + m.val[11];
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#if defined EIGEN_WORLD_VERSION && defined EIGEN_GEOMETRY_MODULE_H
|
||||
|
||||
template<typename T> inline
|
||||
cv::Affine3<T>::Affine3(const Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>& affine)
|
||||
{
|
||||
cv::Mat(4, 4, cv::traits::Type<T>::value, affine.matrix().data()).copyTo(matrix);
|
||||
}
|
||||
|
||||
template<typename T> inline
|
||||
cv::Affine3<T>::Affine3(const Eigen::Transform<T, 3, Eigen::Affine>& affine)
|
||||
{
|
||||
Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)> a = affine;
|
||||
cv::Mat(4, 4, cv::traits::Type<T>::value, a.matrix().data()).copyTo(matrix);
|
||||
}
|
||||
|
||||
template<typename T> inline
|
||||
cv::Affine3<T>::operator Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>() const
|
||||
{
|
||||
Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)> r;
|
||||
cv::Mat hdr(4, 4, cv::traits::Type<T>::value, r.matrix().data());
|
||||
cv::Mat(matrix, false).copyTo(hdr);
|
||||
return r;
|
||||
}
|
||||
|
||||
template<typename T> inline
|
||||
cv::Affine3<T>::operator Eigen::Transform<T, 3, Eigen::Affine>() const
|
||||
{
|
||||
return this->operator Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>();
|
||||
}
|
||||
|
||||
#endif /* defined EIGEN_WORLD_VERSION && defined EIGEN_GEOMETRY_MODULE_H */
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* OPENCV_CORE_AFFINE3_HPP */
|
@ -0,0 +1,707 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2014, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CORE_BASE_HPP
|
||||
#define OPENCV_CORE_BASE_HPP
|
||||
|
||||
#ifndef __cplusplus
|
||||
# error base.hpp header must be compiled as C++
|
||||
#endif
|
||||
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
|
||||
#include <climits>
|
||||
#include <algorithm>
|
||||
|
||||
#include "opencv2/core/cvdef.h"
|
||||
#include "opencv2/core/cvstd.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
//! @addtogroup core_utils
|
||||
//! @{
|
||||
|
||||
namespace Error {
//! error codes (negative values; passed to cv::error / CV_Error)
enum Code {
    StsOk=                       0,  //!< everything is ok
    StsBackTrace=               -1,  //!< pseudo error for back trace
    StsError=                   -2,  //!< unknown /unspecified error
    StsInternal=                -3,  //!< internal error (bad state)
    StsNoMem=                   -4,  //!< insufficient memory
    StsBadArg=                  -5,  //!< function arg/param is bad
    StsBadFunc=                 -6,  //!< unsupported function
    StsNoConv=                  -7,  //!< iteration didn't converge
    StsAutoTrace=               -8,  //!< tracing
    HeaderIsNull=               -9,  //!< image header is NULL
    BadImageSize=              -10,  //!< image size is invalid
    BadOffset=                 -11,  //!< offset is invalid
    BadDataPtr=                -12,  //!<
    BadStep=                   -13,  //!< image step is wrong, this may happen for a non-continuous matrix.
    BadModelOrChSeq=           -14,  //!<
    BadNumChannels=            -15,  //!< bad number of channels, for example, some functions accept only single channel matrices.
    BadNumChannel1U=           -16,  //!<
    BadDepth=                  -17,  //!< input image depth is not supported by the function
    BadAlphaChannel=           -18,  //!<
    BadOrder=                  -19,  //!< number of dimensions is out of range
    BadOrigin=                 -20,  //!< incorrect input origin
    BadAlign=                  -21,  //!< incorrect input align
    BadCallBack=               -22,  //!<
    BadTileSize=               -23,  //!<
    BadCOI=                    -24,  //!< input COI is not supported
    BadROISize=                -25,  //!< incorrect input roi
    MaskIsTiled=               -26,  //!<
    StsNullPtr=                -27,  //!< null pointer
    StsVecLengthErr=           -28,  //!< incorrect vector length
    StsFilterStructContentErr= -29,  //!< incorrect filter structure content
    StsKernelStructContentErr= -30,  //!< incorrect transform kernel content
    StsFilterOffsetErr=        -31,  //!< incorrect filter offset value
    StsBadSize=               -201,  //!< the input/output structure size is incorrect
    StsDivByZero=             -202,  //!< division by zero
    StsInplaceNotSupported=   -203,  //!< in-place operation is not supported
    StsObjectNotFound=        -204,  //!< request can't be completed
    StsUnmatchedFormats=      -205,  //!< formats of input/output arrays differ
    StsBadFlag=               -206,  //!< flag is wrong or not supported
    StsBadPoint=              -207,  //!< bad CvPoint
    StsBadMask=               -208,  //!< bad format of mask (neither 8uC1 nor 8sC1)
    StsUnmatchedSizes=        -209,  //!< sizes of input/output structures do not match
    StsUnsupportedFormat=     -210,  //!< the data format/type is not supported by the function
    StsOutOfRange=            -211,  //!< some of parameters are out of range
    StsParseError=            -212,  //!< invalid syntax/structure of the parsed file
    StsNotImplemented=        -213,  //!< the requested function/feature is not implemented
    StsBadMemBlock=           -214,  //!< an allocated block has been corrupted
    StsAssert=                -215,  //!< assertion failed
    GpuNotSupported=          -216,  //!< no CUDA support
    GpuApiCallError=          -217,  //!< GPU API call error
    OpenGlNotSupported=       -218,  //!< no OpenGL support
    OpenGlApiCallError=       -219,  //!< OpenGL API call error
    OpenCLApiCallError=       -220,  //!< OpenCL API call error
    OpenCLDoubleNotSupported= -221,
    OpenCLInitError=          -222,  //!< OpenCL initialization error
    OpenCLNoAMDBlasFft=       -223
};
} //Error
|
||||
|
||||
//! @} core_utils
|
||||
|
||||
//! @addtogroup core_array
|
||||
//! @{
|
||||
|
||||
//! matrix decomposition types
|
||||
enum DecompTypes {
    /** Gaussian elimination with the optimal pivot element chosen. */
    DECOMP_LU       = 0,
    /** singular value decomposition (SVD) method; the system can be over-defined and/or the matrix
    src1 can be singular */
    DECOMP_SVD      = 1,
    /** eigenvalue decomposition; the matrix src1 must be symmetrical */
    DECOMP_EIG      = 2,
    /** Cholesky \f$LL^T\f$ factorization; the matrix src1 must be symmetrical and positively
    defined */
    DECOMP_CHOLESKY = 3,
    /** QR factorization; the system can be over-defined and/or the matrix src1 can be singular */
    DECOMP_QR       = 4,
    /** while all the previous flags are mutually exclusive, this flag can be used together with
    any of the previous; it means that the normal equations
    \f$\texttt{src1}^T\cdot\texttt{src1}\cdot\texttt{dst}=\texttt{src1}^T\texttt{src2}\f$ are
    solved instead of the original system
    \f$\texttt{src1}\cdot\texttt{dst}=\texttt{src2}\f$ */
    DECOMP_NORMAL   = 16
};
|
||||
|
||||
/** norm types

src1 and src2 denote input arrays.
*/

enum NormTypes {
    /**
    \f[
    norm =  \forkthree
    {\|\texttt{src1}\|_{L_{\infty}} =  \max _I | \texttt{src1} (I)|}{if  \(\texttt{normType} = \texttt{NORM_INF}\) }
    {\|\texttt{src1}-\texttt{src2}\|_{L_{\infty}} =  \max _I | \texttt{src1} (I) -  \texttt{src2} (I)|}{if  \(\texttt{normType} = \texttt{NORM_INF}\) }
    {\frac{\|\texttt{src1}-\texttt{src2}\|_{L_{\infty}}    }{\|\texttt{src2}\|_{L_{\infty}} }}{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_INF}\) }
    \f]
    */
    NORM_INF       = 1,
    /**
    \f[
    norm =  \forkthree
    {\| \texttt{src1} \| _{L_1} =  \sum _I | \texttt{src1} (I)|}{if  \(\texttt{normType} = \texttt{NORM_L1}\)}
    { \| \texttt{src1} - \texttt{src2} \| _{L_1} =  \sum _I | \texttt{src1} (I) -  \texttt{src2} (I)|}{if  \(\texttt{normType} = \texttt{NORM_L1}\) }
    { \frac{\|\texttt{src1}-\texttt{src2}\|_{L_1} }{\|\texttt{src2}\|_{L_1}} }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L1}\) }
    \f]*/
    NORM_L1        = 2,
    /**
    \f[
    norm =  \forkthree
    { \| \texttt{src1} \| _{L_2} =  \sqrt{\sum_I \texttt{src1}(I)^2} }{if  \(\texttt{normType} = \texttt{NORM_L2}\) }
    { \| \texttt{src1} - \texttt{src2} \| _{L_2} =  \sqrt{\sum_I (\texttt{src1}(I) - \texttt{src2}(I))^2} }{if  \(\texttt{normType} = \texttt{NORM_L2}\) }
    { \frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}} }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L2}\) }
    \f]
    */
    NORM_L2        = 4,
    /**
    \f[
    norm =  \forkthree
    { \| \texttt{src1} \| _{L_2} ^{2} = \sum_I \texttt{src1}(I)^2} {if  \(\texttt{normType} = \texttt{NORM_L2SQR}\)}
    { \| \texttt{src1} - \texttt{src2} \| _{L_2} ^{2} =  \sum_I (\texttt{src1}(I) - \texttt{src2}(I))^2 }{if  \(\texttt{normType} = \texttt{NORM_L2SQR}\) }
    { \left(\frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}}\right)^2 }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L2}\) }
    \f]
    */
    NORM_L2SQR     = 5,
    /**
    In the case of one input array, calculates the Hamming distance of the array from zero,
    In the case of two input arrays, calculates the Hamming distance between the arrays.
    */
    NORM_HAMMING   = 6,
    /**
    Similar to NORM_HAMMING, but in the calculation, each two bits of the input sequence will
    be added and treated as a single bit to be used in the same calculation as NORM_HAMMING.
    */
    NORM_HAMMING2  = 7,
    NORM_TYPE_MASK = 7, //!< bit-mask which can be used to separate norm type from norm flags
    NORM_RELATIVE  = 8, //!< flag
    NORM_MINMAX    = 32 //!< flag
};
|
||||
|
||||
//! comparison types
|
||||
enum CmpTypes { CMP_EQ = 0, //!< src1 is equal to src2.
|
||||
CMP_GT = 1, //!< src1 is greater than src2.
|
||||
CMP_GE = 2, //!< src1 is greater than or equal to src2.
|
||||
CMP_LT = 3, //!< src1 is less than src2.
|
||||
CMP_LE = 4, //!< src1 is less than or equal to src2.
|
||||
CMP_NE = 5 //!< src1 is unequal to src2.
|
||||
};
|
||||
|
||||
//! generalized matrix multiplication flags
|
||||
enum GemmFlags { GEMM_1_T = 1, //!< transposes src1
|
||||
GEMM_2_T = 2, //!< transposes src2
|
||||
GEMM_3_T = 4 //!< transposes src3
|
||||
};
|
||||
|
||||
enum DftFlags {
    /** performs an inverse 1D or 2D transform instead of the default forward
        transform. */
    DFT_INVERSE        = 1,
    /** scales the result: divide it by the number of array elements. Normally, it is
        combined with DFT_INVERSE. */
    DFT_SCALE          = 2,
    /** performs a forward or inverse transform of every individual row of the input
        matrix; this flag enables you to transform multiple vectors simultaneously and can be used to
        decrease the overhead (which is sometimes several times larger than the processing itself) to
        perform 3D and higher-dimensional transformations and so forth.*/
    DFT_ROWS           = 4,
    /** performs a forward transformation of 1D or 2D real array; the result,
        though being a complex array, has complex-conjugate symmetry (*CCS*, see the function
        description below for details), and such an array can be packed into a real array of the same
        size as input, which is the fastest option and which is what the function does by default;
        however, you may wish to get a full complex array (for simpler spectrum analysis, and so on) -
        pass the flag to enable the function to produce a full-size complex output array. */
    DFT_COMPLEX_OUTPUT = 16,
    /** performs an inverse transformation of a 1D or 2D complex array; the
        result is normally a complex array of the same size, however, if the input array has
        conjugate-complex symmetry (for example, it is a result of forward transformation with
        DFT_COMPLEX_OUTPUT flag), the output is a real array; while the function itself does not
        check whether the input is symmetrical or not, you can pass the flag and then the function
        will assume the symmetry and produce the real output array (note that when the input is packed
        into a real array and inverse transformation is executed, the function treats the input as a
        packed complex-conjugate symmetrical array, and the output will also be a real array). */
    DFT_REAL_OUTPUT    = 32,
    /** specifies that input is complex input. If this flag is set, the input must have 2 channels.
        On the other hand, for backwards compatibility reason, if input has 2 channels, input is
        already considered complex. */
    DFT_COMPLEX_INPUT  = 64,
    /** performs an inverse 1D or 2D transform instead of the default forward transform. */
    DCT_INVERSE        = DFT_INVERSE,
    /** performs a forward or inverse transform of every individual row of the input
        matrix. This flag enables you to transform multiple vectors simultaneously and can be used to
        decrease the overhead (which is sometimes several times larger than the processing itself) to
        perform 3D and higher-dimensional transforms and so forth.*/
    DCT_ROWS           = DFT_ROWS
};
|
||||
|
||||
//! Various border types, image boundaries are denoted with `|`
//! @see borderInterpolate, copyMakeBorder
enum BorderTypes {
    BORDER_CONSTANT    = 0, //!< `iiiiii|abcdefgh|iiiiiii`  with some specified `i`
    BORDER_REPLICATE   = 1, //!< `aaaaaa|abcdefgh|hhhhhhh`
    BORDER_REFLECT     = 2, //!< `fedcba|abcdefgh|hgfedcb`
    BORDER_WRAP        = 3, //!< `cdefgh|abcdefgh|abcdefg`
    BORDER_REFLECT_101 = 4, //!< `gfedcb|abcdefgh|gfedcba`
    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno`

    BORDER_REFLECT101  = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
    BORDER_DEFAULT     = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
    BORDER_ISOLATED    = 16 //!< do not look outside of ROI
};
|
||||
|
||||
//! @} core_array
|
||||
|
||||
//! @addtogroup core_utils
|
||||
//! @{
|
||||
|
||||
/*! @brief Signals an error and raises the exception.

By default the function prints information about the error to stderr,
then it either stops if setBreakOnError() had been called before or raises the exception.
It is possible to alternate error processing by using redirectError().
@param _code - error code (Error::Code)
@param _err - error description
@param _func - function name. Available only when the compiler supports getting it
@param _file - source file name where the error has occurred
@param _line - line number in the source file where the error has occurred
@see CV_Error, CV_Error_, CV_Assert, CV_DbgAssert
 */
CV_EXPORTS void error(int _code, const String& _err, const char* _func, const char* _file, int _line);
|
||||
|
||||
#ifdef __GNUC__
# if defined __clang__ || defined __APPLE__
// Clang warns that a CV_NORETURN function appears to return; suppress it for
// the definition below, since error() itself never returns.
#   pragma GCC diagnostic push
#   pragma GCC diagnostic ignored "-Winvalid-noreturn"
# endif
#endif

/** same as cv::error, but does not return */
CV_INLINE CV_NORETURN void errorNoReturn(int _code, const String& _err, const char* _func, const char* _file, int _line)
{
    error(_code, _err, _func, _file, _line);
#ifdef __GNUC__
# if !defined __clang__ && !defined __APPLE__
    // this suppresses this warning: "noreturn" function does return [enabled by default]
    __builtin_trap();
    // or use infinite loop: for (;;) {}
# endif
#endif
}
#ifdef __GNUC__
# if defined __clang__ || defined __APPLE__
#   pragma GCC diagnostic pop
# endif
#endif
|
||||
|
||||
#ifdef CV_STATIC_ANALYSIS

// In practice, some macro are not processed correctly (noreturn is not detected).
// We need to use simplified definition for them.
#define CV_Error(...) do { abort(); } while (0)
#define CV_Error_( code, args ) do { cv::format args; abort(); } while (0)
#define CV_Assert( expr ) do { if (!(expr)) abort(); } while (0)
#define CV_ErrorNoReturn CV_Error
#define CV_ErrorNoReturn_ CV_Error_

#else // CV_STATIC_ANALYSIS

/** @brief Call the error handler.

Currently, the error handler prints the error code and the error message to the standard
error stream `stderr`. In the Debug configuration, it then provokes memory access violation, so that
the execution stack and all the parameters can be analyzed by the debugger. In the Release
configuration, the exception is thrown.

@param code one of Error::Code
@param msg error message
*/
#define CV_Error( code, msg ) cv::error( code, msg, CV_Func, __FILE__, __LINE__ )

/** @brief Call the error handler.

This macro can be used to construct an error message on-fly to include some dynamic information,
for example:
@code
    // note the extra parentheses around the formatted text message
    CV_Error_(Error::StsOutOfRange,
    ("the value at (%d, %d)=%g is out of range", badPt.x, badPt.y, badValue));
@endcode
@param code one of Error::Code
@param args printf-like formatted error message in parentheses
*/
#define CV_Error_( code, args ) cv::error( code, cv::format args, CV_Func, __FILE__, __LINE__ )

/** @brief Checks a condition at runtime and throws exception if it fails

The macros CV_Assert (and CV_DbgAssert(expr)) evaluate the specified expression. If it is 0, the macros
raise an error (see cv::error). The macro CV_Assert checks the condition in both Debug and Release
configurations while CV_DbgAssert is only retained in the Debug configuration.
*/
// The `if(!!(expr)) ; else ...` shape evaluates `expr` exactly once and avoids
// "condition is always true" warnings some compilers emit for `if(!(expr))`.
#define CV_Assert( expr ) do { if(!!(expr)) ; else cv::error( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ ); } while(0)

//! @cond IGNORED
// Non-returning variants; used in place of the public macros inside the OpenCV build.
#define CV__ErrorNoReturn( code, msg ) cv::errorNoReturn( code, msg, CV_Func, __FILE__, __LINE__ )
#define CV__ErrorNoReturn_( code, args ) cv::errorNoReturn( code, cv::format args, CV_Func, __FILE__, __LINE__ )
#ifdef __OPENCV_BUILD
#undef CV_Error
#define CV_Error CV__ErrorNoReturn
#undef CV_Error_
#define CV_Error_ CV__ErrorNoReturn_
#undef CV_Assert
#define CV_Assert( expr ) do { if(!!(expr)) ; else cv::errorNoReturn( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ ); } while(0)
#else
// backward compatibility
#define CV_ErrorNoReturn CV__ErrorNoReturn
#define CV_ErrorNoReturn_ CV__ErrorNoReturn_
#endif
//! @endcond

#endif // CV_STATIC_ANALYSIS
|
||||
|
||||
//! @cond IGNORED

#if defined OPENCV_FORCE_MULTIARG_ASSERT_CHECK && defined CV_STATIC_ANALYSIS
#warning "OPENCV_FORCE_MULTIARG_ASSERT_CHECK can't be used with CV_STATIC_ANALYSIS"
#undef OPENCV_FORCE_MULTIARG_ASSERT_CHECK
#endif

#ifdef OPENCV_FORCE_MULTIARG_ASSERT_CHECK
#define CV_Assert_1( expr ) do { if(!!(expr)) ; else cv::error( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ ); } while(0)
#else
#define CV_Assert_1 CV_Assert
#endif
// Multi-argument assert support: each arity forwards to the next-lower arity,
// so each expression is checked (and reported) individually.
#define CV_Assert_2( expr1, expr2 ) CV_Assert_1(expr1); CV_Assert_1(expr2)
#define CV_Assert_3( expr1, expr2, expr3 ) CV_Assert_2(expr1, expr2); CV_Assert_1(expr3)
#define CV_Assert_4( expr1, expr2, expr3, expr4 ) CV_Assert_3(expr1, expr2, expr3); CV_Assert_1(expr4)
#define CV_Assert_5( expr1, expr2, expr3, expr4, expr5 ) CV_Assert_4(expr1, expr2, expr3, expr4); CV_Assert_1(expr5)
#define CV_Assert_6( expr1, expr2, expr3, expr4, expr5, expr6 ) CV_Assert_5(expr1, expr2, expr3, expr4, expr5); CV_Assert_1(expr6)
#define CV_Assert_7( expr1, expr2, expr3, expr4, expr5, expr6, expr7 ) CV_Assert_6(expr1, expr2, expr3, expr4, expr5, expr6 ); CV_Assert_1(expr7)
#define CV_Assert_8( expr1, expr2, expr3, expr4, expr5, expr6, expr7, expr8 ) CV_Assert_7(expr1, expr2, expr3, expr4, expr5, expr6, expr7 ); CV_Assert_1(expr8)
#define CV_Assert_9( expr1, expr2, expr3, expr4, expr5, expr6, expr7, expr8, expr9 ) CV_Assert_8(expr1, expr2, expr3, expr4, expr5, expr6, expr7, expr8 ); CV_Assert_1(expr9)
#define CV_Assert_10( expr1, expr2, expr3, expr4, expr5, expr6, expr7, expr8, expr9, expr10 ) CV_Assert_9(expr1, expr2, expr3, expr4, expr5, expr6, expr7, expr8, expr9 ); CV_Assert_1(expr10)

// Dispatches to CV_Assert_<N> based on the number of variadic arguments.
#define CV_Assert_N(...) do { __CV_CAT(CV_Assert_, __CV_VA_NUM_ARGS(__VA_ARGS__)) (__VA_ARGS__); } while(0)

#ifdef OPENCV_FORCE_MULTIARG_ASSERT_CHECK
#undef CV_Assert
#define CV_Assert CV_Assert_N
#endif
//! @endcond

#if defined _DEBUG || defined CV_STATIC_ANALYSIS
# define CV_DbgAssert(expr) CV_Assert(expr)
#else
/** replaced with CV_Assert(expr) in Debug configuration */
# define CV_DbgAssert(expr)
#endif
|
||||
|
||||
/*
 * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor
 * bit count of A exclusive XOR'ed with B
 */
struct CV_EXPORTS Hamming
{
    enum { normType = NORM_HAMMING };
    typedef unsigned char ValueType;  // element type of the compared byte strings
    typedef int ResultType;           // accumulated distance type

    /** this will count the bits in a ^ b
    @param a first byte string
    @param b second byte string
    @param size number of bytes to compare
     */
    ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const;
};

// Historical alias kept for backward compatibility.
typedef Hamming HammingLUT;
|
||||
|
||||
/////////////////////////////////// inline norms ////////////////////////////////////

// Absolute-value helpers used by the norm implementations below.
// Unsigned types are returned as-is (they cannot be negative); signed small
// integer types widen to int so std::abs is well defined.
template<typename _Tp> inline _Tp cv_abs(_Tp x) { return std::abs(x); }
inline int cv_abs(uchar x) { return x; }
inline int cv_abs(schar x) { return std::abs(x); }
inline int cv_abs(ushort x) { return x; }
inline int cv_abs(short x) { return std::abs(x); }
|
||||
|
||||
/** @brief Squared L2 norm of the first n elements of a, accumulated in _AccTp. */
template<typename _Tp, typename _AccTp> static inline
_AccTp normL2Sqr(const _Tp* a, int n)
{
    _AccTp sum = 0;
    int idx = 0;
#if CV_ENABLE_UNROLLED
    // Main loop processes four elements per iteration.
    for( ; idx <= n - 4; idx += 4 )
    {
        const _AccTp e0 = a[idx], e1 = a[idx + 1];
        const _AccTp e2 = a[idx + 2], e3 = a[idx + 3];
        sum += e0*e0 + e1*e1 + e2*e2 + e3*e3;
    }
#endif
    // Scalar tail (or the whole array when unrolling is disabled).
    while( idx < n )
    {
        const _AccTp e = a[idx++];
        sum += e*e;
    }
    return sum;
}
|
||||
|
||||
/** @brief L1 norm (sum of absolute values) of the first n elements of a,
accumulated in _AccTp via the cv_abs helpers above. */
template<typename _Tp, typename _AccTp> static inline
_AccTp normL1(const _Tp* a, int n)
{
    _AccTp s = 0;
    int i = 0;
#if CV_ENABLE_UNROLLED
    // Manual 4-way unrolling of the main loop.
    for(; i <= n - 4; i += 4 )
    {
        s += (_AccTp)cv_abs(a[i]) + (_AccTp)cv_abs(a[i+1]) +
             (_AccTp)cv_abs(a[i+2]) + (_AccTp)cv_abs(a[i+3]);
    }
#endif
    // Scalar tail.
    for( ; i < n; i++ )
        s += cv_abs(a[i]);
    return s;
}
|
||||
|
||||
/** @brief Infinity norm (maximum absolute value) of the first n elements of a.
Returns 0 for n <= 0. */
template<typename _Tp, typename _AccTp> static inline
_AccTp normInf(const _Tp* a, int n)
{
    _AccTp s = 0;
    for( int i = 0; i < n; i++ )
        s = std::max(s, (_AccTp)cv_abs(a[i]));
    return s;
}
|
||||
|
||||
/** @brief Squared L2 (Euclidean) distance between two n-element arrays,
with differences computed in _AccTp. */
template<typename _Tp, typename _AccTp> static inline
_AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
{
    _AccTp acc = 0;
    int pos = 0;
#if CV_ENABLE_UNROLLED
    // 4-way unrolled main loop.
    for( ; pos <= n - 4; pos += 4 )
    {
        const _AccTp d0 = _AccTp(a[pos] - b[pos]);
        const _AccTp d1 = _AccTp(a[pos+1] - b[pos+1]);
        const _AccTp d2 = _AccTp(a[pos+2] - b[pos+2]);
        const _AccTp d3 = _AccTp(a[pos+3] - b[pos+3]);
        acc += d0*d0 + d1*d1 + d2*d2 + d3*d3;
    }
#endif
    // Scalar tail.
    while( pos < n )
    {
        const _AccTp d = _AccTp(a[pos] - b[pos]);
        acc += d*d;
        ++pos;
    }
    return acc;
}
|
||||
|
||||
/** @brief Squared L2 distance, single-precision specialization (no unrolling). */
static inline float normL2Sqr(const float* a, const float* b, int n)
{
    float acc = 0.f;
    for( int k = 0; k < n; k++ )
    {
        const float d = a[k] - b[k];
        acc += d*d;
    }
    return acc;
}
|
||||
|
||||
/** @brief L1 (Manhattan) distance between two n-element arrays,
with differences computed in _AccTp. */
template<typename _Tp, typename _AccTp> static inline
_AccTp normL1(const _Tp* a, const _Tp* b, int n)
{
    _AccTp acc = 0;
    int pos = 0;
#if CV_ENABLE_UNROLLED
    // 4-way unrolled main loop.
    for( ; pos <= n - 4; pos += 4 )
    {
        const _AccTp d0 = _AccTp(a[pos] - b[pos]);
        const _AccTp d1 = _AccTp(a[pos+1] - b[pos+1]);
        const _AccTp d2 = _AccTp(a[pos+2] - b[pos+2]);
        const _AccTp d3 = _AccTp(a[pos+3] - b[pos+3]);
        acc += std::abs(d0) + std::abs(d1) + std::abs(d2) + std::abs(d3);
    }
#endif
    // Scalar tail.
    while( pos < n )
    {
        const _AccTp d = _AccTp(a[pos] - b[pos]);
        acc += std::abs(d);
        ++pos;
    }
    return acc;
}
|
||||
|
||||
/** @brief L1 distance, single-precision specialization (no unrolling). */
inline float normL1(const float* a, const float* b, int n)
{
    float acc = 0.f;
    for( int k = 0; k < n; k++ )
        acc += std::abs(a[k] - b[k]);
    return acc;
}
|
||||
|
||||
/** @brief L1 distance, 8-bit unsigned specialization.
uchar - uchar promotes to int, so std::abs(int) is used and no overflow occurs. */
inline int normL1(const uchar* a, const uchar* b, int n)
{
    int s = 0;
    for( int i = 0; i < n; i++ )
    {
        s += std::abs(a[i] - b[i]);
    }
    return s;
}
|
||||
|
||||
/** @brief Infinity (Chebyshev) distance between two n-element arrays:
the maximum absolute element-wise difference. Returns 0 for n <= 0. */
template<typename _Tp, typename _AccTp> static inline
_AccTp normInf(const _Tp* a, const _Tp* b, int n)
{
    _AccTp best = 0;
    for( int k = 0; k < n; k++ )
    {
        const _AccTp d = a[k] - b[k];
        best = std::max(best, std::abs(d));
    }
    return best;
}
|
||||
|
||||
/** @brief Computes the cube root of an argument.

The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
single-precision data.
@param val A function argument.
*/
CV_EXPORTS_W float cubeRoot(float val);

/** @brief Calculates the angle of a 2D vector in degrees.

The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
@param x x-coordinate of the vector.
@param y y-coordinate of the vector.
*/
CV_EXPORTS_W float fastAtan2(float y, float x);

/** proxy for hal::LU (single precision) */
CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
/** proxy for hal::LU (double precision) */
CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
/** proxy for hal::Cholesky (single precision) */
CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
/** proxy for hal::Cholesky (double precision) */
CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
|
||||
|
||||
////////////////// forward declarations for important OpenCV types //////////////////

//! @cond IGNORED

// Fixed-size vector / matrix templates.
template<typename _Tp, int cn> class Vec;
template<typename _Tp, int m, int n> class Matx;

// Basic geometric / scalar value templates.
template<typename _Tp> class Complex;
template<typename _Tp> class Point_;
template<typename _Tp> class Point3_;
template<typename _Tp> class Size_;
template<typename _Tp> class Rect_;
template<typename _Tp> class Scalar_;

class CV_EXPORTS RotatedRect;
class CV_EXPORTS Range;
class CV_EXPORTS TermCriteria;
class CV_EXPORTS KeyPoint;
class CV_EXPORTS DMatch;
class CV_EXPORTS RNG;

// Dense array types.
class CV_EXPORTS Mat;
class CV_EXPORTS MatExpr;

class CV_EXPORTS UMat;

// Sparse array types.
class CV_EXPORTS SparseMat;
typedef Mat MatND;

template<typename _Tp> class Mat_;
template<typename _Tp> class SparseMat_;

// Iterator types for the array classes above.
class CV_EXPORTS MatConstIterator;
class CV_EXPORTS SparseMatIterator;
class CV_EXPORTS SparseMatConstIterator;
template<typename _Tp> class MatIterator_;
template<typename _Tp> class MatConstIterator_;
template<typename _Tp> class SparseMatIterator_;
template<typename _Tp> class SparseMatConstIterator_;

// OpenGL interoperability types.
namespace ogl
{
    class CV_EXPORTS Buffer;
    class CV_EXPORTS Texture2D;
    class CV_EXPORTS Arrays;
}

// CUDA module types.
namespace cuda
{
    class CV_EXPORTS GpuMat;
    class CV_EXPORTS HostMem;
    class CV_EXPORTS Stream;
    class CV_EXPORTS Event;
}

namespace cudev
{
    template <typename _Tp> class GpuMat_;
}
|
||||
|
||||
// Runtime control/status API for the Intel IPP acceleration backend.
namespace ipp
{
// Return type widened after ABI 300 to expose the full feature mask.
#if OPENCV_ABI_COMPATIBILITY > 300
CV_EXPORTS unsigned long long getIppFeatures();
#else
CV_EXPORTS int getIppFeatures();
#endif
// Records the last IPP call status together with its source location.
CV_EXPORTS void setIppStatus(int status, const char * const funcname = NULL, const char * const filename = NULL,
                             int line = 0);
CV_EXPORTS int getIppStatus();
CV_EXPORTS String getIppErrorLocation();
//! enables/queries use of IPP at runtime
CV_EXPORTS_W bool useIPP();
CV_EXPORTS_W void setUseIPP(bool flag);
CV_EXPORTS_W String getIppVersion();

// IPP Not-Exact mode. This function may force use of IPP then both IPP and OpenCV provide proper results
// but have internal accuracy differences which have too much direct or indirect impact on accuracy tests.
CV_EXPORTS_W bool useIPP_NotExact();
CV_EXPORTS_W void setUseIPP_NotExact(bool flag);
#if OPENCV_ABI_COMPATIBILITY < 400
// Deprecated short-name aliases kept for ABI < 400.
CV_EXPORTS_W bool useIPP_NE();
CV_EXPORTS_W void setUseIPP_NE(bool flag);
#endif

} // ipp
|
||||
|
||||
//! @endcond
|
||||
|
||||
//! @} core_utils
|
||||
|
||||
|
||||
|
||||
|
||||
} // cv
|
||||
|
||||
#include "opencv2/core/neon_utils.hpp"
|
||||
#include "opencv2/core/vsx_utils.hpp"
|
||||
#include "opencv2/core/check.hpp"
|
||||
|
||||
#endif //OPENCV_CORE_BASE_HPP
|
@ -0,0 +1,23 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_CORE_BINDINGS_UTILS_HPP
#define OPENCV_CORE_BINDINGS_UTILS_HPP

namespace cv { namespace utils {
//! @addtogroup core_utils
//! @{

// Diagnostic helpers exposed to the language bindings: each returns a textual
// description of its argument. NOTE(review): exact dump format is defined in
// the implementation, not visible here — confirm before relying on it.
CV_EXPORTS_W String dumpInputArray(InputArray argument);

CV_EXPORTS_W String dumpInputArrayOfArrays(InputArrayOfArrays argument);

CV_EXPORTS_W String dumpInputOutputArray(InputOutputArray argument);

CV_EXPORTS_W String dumpInputOutputArrayOfArrays(InputOutputArrayOfArrays argument);

//! @}
}} // namespace

#endif // OPENCV_CORE_BINDINGS_UTILS_HPP
|
@ -0,0 +1,40 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
//
|
||||
// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
|
||||
|
||||
#ifndef OPENCV_CORE_BUFFER_POOL_HPP
|
||||
#define OPENCV_CORE_BUFFER_POOL_HPP
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable: 4265)
|
||||
#endif
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
//! @addtogroup core
|
||||
//! @{
|
||||
|
||||
// Abstract interface for querying and limiting the memory an allocator's
// buffer pool keeps reserved.
class BufferPoolController
{
protected:
    // Non-virtual protected destructor: instances are not owned (or deleted)
    // through this interface pointer.
    ~BufferPoolController() { }
public:
    //! currently reserved size held by the pool
    virtual size_t getReservedSize() const = 0;
    //! upper bound the pool is allowed to keep reserved
    virtual size_t getMaxReservedSize() const = 0;
    //! sets the upper bound on the reserved size
    virtual void setMaxReservedSize(size_t size) = 0;
    //! releases all buffers currently retained by the pool
    virtual void freeAllReservedBuffers() = 0;
};
|
||||
|
||||
//! @}
|
||||
|
||||
}
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
#endif // OPENCV_CORE_BUFFER_POOL_HPP
|
@ -0,0 +1,157 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_CORE_CHECK_HPP
|
||||
#define OPENCV_CORE_CHECK_HPP
|
||||
|
||||
#include <opencv2/core/base.hpp>
|
||||
|
||||
namespace cv {
|
||||
|
||||
/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or "<invalid depth>" */
CV_EXPORTS const char* depthToString(int depth);

/** Returns string of cv::Mat depth value: CV_8UC3 -> "CV_8UC3" or "<invalid type>" */
CV_EXPORTS const String typeToString(int type);
|
||||
|
||||
|
||||
//! @cond IGNORED
|
||||
namespace detail {
|
||||
|
||||
/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or NULL */
CV_EXPORTS const char* depthToString_(int depth);

/** Returns string of cv::Mat depth value: CV_8UC3 -> "CV_8UC3" or cv::String() */
CV_EXPORTS const cv::String typeToString_(int type);

// Comparison operator performed by a CV_Check* macro; stored in CheckContext
// so the failure message can name the operation.
enum TestOp {
  TEST_CUSTOM = 0,
  TEST_EQ = 1,
  TEST_NE = 2,
  TEST_LE = 3,
  TEST_LT = 4,
  TEST_GE = 5,
  TEST_GT = 6,
  CV__LAST_TEST_OP
};

// Static per-call-site description of a check: where it is, what it compares,
// and the stringified operand expressions.
struct CheckContext {
    const char* func;    // enclosing function signature
    const char* file;    // source file of the check
    int line;            // source line of the check
    enum TestOp testOp;  // comparison being performed
    const char* message; // user-provided message
    const char* p1_str;  // stringified first operand expression
    const char* p2_str;  // stringified second operand / test expression
};
|
||||
|
||||
#ifndef CV__CHECK_FILENAME
# define CV__CHECK_FILENAME __FILE__
#endif

#ifndef CV__CHECK_FUNCTION
# if defined _MSC_VER
# define CV__CHECK_FUNCTION __FUNCSIG__
# elif defined __GNUC__
# define CV__CHECK_FUNCTION __PRETTY_FUNCTION__
# else
# define CV__CHECK_FUNCTION "<unknown>"
# endif
#endif

// Generates a variable name unique to the call site (id + line number), so each
// check gets its own static CheckContext.
#define CV__CHECK_LOCATION_VARNAME(id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_check_, id), __LINE__)
#define CV__DEFINE_CHECK_CONTEXT(id, message, testOp, p1_str, p2_str) \
    static const cv::detail::CheckContext CV__CHECK_LOCATION_VARNAME(id) = \
    { CV__CHECK_FUNCTION, CV__CHECK_FILENAME, __LINE__, testOp, message, p1_str, p2_str }

// Failure handlers for two-value comparisons; the suffix selects how operand
// values are rendered (plain value vs. decoded Mat depth/type/channels).
CV_EXPORTS void CV_NORETURN check_failed_auto(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v1, const size_t v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const float v1, const float v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const double v1, const double v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v1, const int v2, const CheckContext& ctx);

// Failure handlers for single-value (custom test expression) checks.
CV_EXPORTS void CV_NORETURN check_failed_auto(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const float v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const double v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v, const CheckContext& ctx);


// Elementary predicates selected by token-pasting CV__TEST_ with the op name.
#define CV__TEST_EQ(v1, v2) ((v1) == (v2))
#define CV__TEST_NE(v1, v2) ((v1) != (v2))
#define CV__TEST_LE(v1, v2) ((v1) <= (v2))
#define CV__TEST_LT(v1, v2) ((v1) < (v2))
#define CV__TEST_GE(v1, v2) ((v1) >= (v2))
#define CV__TEST_GT(v1, v2) ((v1) > (v2))

// Core two-value check: on failure, materializes a static CheckContext for this
// call site and dispatches to the matching check_failed_<type> handler.
#define CV__CHECK(id, op, type, v1, v2, v1_str, v2_str, msg_str) do { \
    if(CV__TEST_##op((v1), (v2))) ; else { \
        CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_ ## op, v1_str, v2_str); \
        cv::detail::check_failed_ ## type((v1), (v2), CV__CHECK_LOCATION_VARNAME(id)); \
    } \
} while (0)

// Core custom-expression check: the value v is only used for error reporting.
#define CV__CHECK_CUSTOM_TEST(id, type, v, test_expr, v_str, test_expr_str, msg_str) do { \
    if(!!(test_expr)) ; else { \
        CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_CUSTOM, v_str, test_expr_str); \
        cv::detail::check_failed_ ## type((v), CV__CHECK_LOCATION_VARNAME(id)); \
    } \
} while (0)
|
||||
|
||||
} // namespace
|
||||
//! @endcond
|
||||
|
||||
|
||||
/// Supported values of these types: int, float, double
#define CV_CheckEQ(v1, v2, msg)  CV__CHECK(_, EQ, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckNE(v1, v2, msg)  CV__CHECK(_, NE, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckLE(v1, v2, msg)  CV__CHECK(_, LE, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckLT(v1, v2, msg)  CV__CHECK(_, LT, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckGE(v1, v2, msg)  CV__CHECK(_, GE, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckGT(v1, v2, msg)  CV__CHECK(_, GT, auto, v1, v2, #v1, #v2, msg)

/// Check with additional "decoding" of type values in error message
#define CV_CheckTypeEQ(t1, t2, msg)  CV__CHECK(_, EQ, MatType, t1, t2, #t1, #t2, msg)
/// Check with additional "decoding" of depth values in error message
#define CV_CheckDepthEQ(d1, d2, msg)  CV__CHECK(_, EQ, MatDepth, d1, d2, #d1, #d2, msg)

/// Check with additional "decoding" of channel counts in error message
#define CV_CheckChannelsEQ(c1, c2, msg)  CV__CHECK(_, EQ, MatChannels, c1, c2, #c1, #c2, msg)

/// Example: type == CV_8UC1 || type == CV_8UC3
#define CV_CheckType(t, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, MatType, t, (test_expr), #t, #test_expr, msg)

/// Example: depth == CV_32F || depth == CV_64F
#define CV_CheckDepth(t, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, MatDepth, t, (test_expr), #t, #test_expr, msg)

/// Example: v == A || v == B
#define CV_Check(v, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg)

/// Some complex conditions: CV_Check(src2, src2.empty() || (src2.type() == src1.type() && src2.size() == src1.size()), "src2 should have same size/type as src1")
// TODO define pretty-printers

// Debug-only variants: active when NDEBUG is not defined, otherwise compiled out.
#ifndef NDEBUG
#define CV_DbgCheck(v, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg)
#define CV_DbgCheckEQ(v1, v2, msg)  CV__CHECK(_, EQ, auto, v1, v2, #v1, #v2, msg)
#define CV_DbgCheckNE(v1, v2, msg)  CV__CHECK(_, NE, auto, v1, v2, #v1, #v2, msg)
#define CV_DbgCheckLE(v1, v2, msg)  CV__CHECK(_, LE, auto, v1, v2, #v1, #v2, msg)
#define CV_DbgCheckLT(v1, v2, msg)  CV__CHECK(_, LT, auto, v1, v2, #v1, #v2, msg)
#define CV_DbgCheckGE(v1, v2, msg)  CV__CHECK(_, GE, auto, v1, v2, #v1, #v2, msg)
#define CV_DbgCheckGT(v1, v2, msg)  CV__CHECK(_, GT, auto, v1, v2, #v1, #v2, msg)
#else
#define CV_DbgCheck(v, test_expr, msg)  do { } while (0)
#define CV_DbgCheckEQ(v1, v2, msg)  do { } while (0)
#define CV_DbgCheckNE(v1, v2, msg)  do { } while (0)
#define CV_DbgCheckLE(v1, v2, msg)  do { } while (0)
#define CV_DbgCheckLT(v1, v2, msg)  do { } while (0)
#define CV_DbgCheckGE(v1, v2, msg)  do { } while (0)
#define CV_DbgCheckGT(v1, v2, msg)  do { } while (0)
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // OPENCV_CORE_CHECK_HPP
|
@ -0,0 +1,48 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
// Compatibility shim: legacy includes are redirected to the modern core header.
// Must not be used when building OpenCV itself.
#ifdef __OPENCV_BUILD
#error this is a compatibility header which should not be used inside the OpenCV library
#endif

#include "opencv2/core.hpp"
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,631 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CORE_CUDAINL_HPP
|
||||
#define OPENCV_CORE_CUDAINL_HPP
|
||||
|
||||
#include "opencv2/core/cuda.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda {
|
||||
|
||||
//===================================================================================
// GpuMat
//===================================================================================

// Default constructor: empty matrix; only remembers the allocator.
inline
GpuMat::GpuMat(Allocator* allocator_)
    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
{}

// Allocates rows_ x cols_ of the given type; stays empty for non-positive sizes.
inline
GpuMat::GpuMat(int rows_, int cols_, int type_, Allocator* allocator_)
    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
{
    if (rows_ > 0 && cols_ > 0)
        create(rows_, cols_, type_);
}

// Same as above, size given as a Size (width x height).
inline
GpuMat::GpuMat(Size size_, int type_, Allocator* allocator_)
    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
{
    if (size_.height > 0 && size_.width > 0)
        create(size_.height, size_.width, type_);
}

// Allocates and initializes every element to s_.
inline
GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_, Allocator* allocator_)
    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
{
    if (rows_ > 0 && cols_ > 0)
    {
        create(rows_, cols_, type_);
        setTo(s_);
    }
}

// Size-based variant of the value-initializing constructor.
inline
GpuMat::GpuMat(Size size_, int type_, Scalar s_, Allocator* allocator_)
    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
{
    if (size_.height > 0 && size_.width > 0)
    {
        create(size_.height, size_.width, type_);
        setTo(s_);
    }
}

// Copy constructor: shallow copy sharing the data; bumps the reference count.
inline
GpuMat::GpuMat(const GpuMat& m)
    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), allocator(m.allocator)
{
    if (refcount)
        CV_XADD(refcount, 1);
}

// Constructs from any InputArray by uploading its contents to the device.
inline
GpuMat::GpuMat(InputArray arr, Allocator* allocator_) :
    flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
{
    upload(arr);
}
|
||||
|
||||
// Destructor: drops this reference; storage is freed when the count reaches zero
// inside release().
inline
GpuMat::~GpuMat()
{
    release();
}

// Copy assignment via copy-and-swap: the temporary takes over the old state and
// releases it on destruction. Self-assignment is a no-op.
inline
GpuMat& GpuMat::operator =(const GpuMat& m)
{
    if (this != &m)
    {
        GpuMat temp(m);
        swap(temp);
    }

    return *this;
}

// Size-based forwarding overload of create().
inline
void GpuMat::create(Size size_, int type_)
{
    create(size_.height, size_.width, type_);
}

// Swaps every data member with b; no allocation or copies.
inline
void GpuMat::swap(GpuMat& b)
{
    std::swap(flags, b.flags);
    std::swap(rows, b.rows);
    std::swap(cols, b.cols);
    std::swap(step, b.step);
    std::swap(data, b.data);
    std::swap(datastart, b.datastart);
    std::swap(dataend, b.dataend);
    std::swap(refcount, b.refcount);
    std::swap(allocator, b.allocator);
}

// Deep copy: unlike the copy constructor, the result owns its own storage.
inline
GpuMat GpuMat::clone() const
{
    GpuMat m;
    copyTo(m);
    return m;
}
|
||||
|
||||
inline
|
||||
void GpuMat::copyTo(OutputArray dst, InputArray mask) const
|
||||
{
|
||||
copyTo(dst, mask, Stream::Null());
|
||||
}
|
||||
|
||||
inline
|
||||
GpuMat& GpuMat::setTo(Scalar s)
|
||||
{
|
||||
return setTo(s, Stream::Null());
|
||||
}
|
||||
|
||||
inline
|
||||
GpuMat& GpuMat::setTo(Scalar s, InputArray mask)
|
||||
{
|
||||
return setTo(s, mask, Stream::Null());
|
||||
}
|
||||
|
||||
inline
|
||||
void GpuMat::convertTo(OutputArray dst, int rtype) const
|
||||
{
|
||||
convertTo(dst, rtype, Stream::Null());
|
||||
}
|
||||
|
||||
inline
|
||||
void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, double beta) const
|
||||
{
|
||||
convertTo(dst, rtype, alpha, beta, Stream::Null());
|
||||
}
|
||||
|
||||
inline
|
||||
void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const
|
||||
{
|
||||
convertTo(dst, rtype, alpha, 0.0, stream);
|
||||
}
|
||||
|
||||
inline
|
||||
void GpuMat::assignTo(GpuMat& m, int _type) const
|
||||
{
|
||||
if (_type < 0)
|
||||
m = *this;
|
||||
else
|
||||
convertTo(m, _type);
|
||||
}
|
||||
|
||||
inline
|
||||
uchar* GpuMat::ptr(int y)
|
||||
{
|
||||
CV_DbgAssert( (unsigned)y < (unsigned)rows );
|
||||
return data + step * y;
|
||||
}
|
||||
|
||||
inline
|
||||
const uchar* GpuMat::ptr(int y) const
|
||||
{
|
||||
CV_DbgAssert( (unsigned)y < (unsigned)rows );
|
||||
return data + step * y;
|
||||
}
|
||||
|
||||
template<typename _Tp> inline
|
||||
_Tp* GpuMat::ptr(int y)
|
||||
{
|
||||
return (_Tp*)ptr(y);
|
||||
}
|
||||
|
||||
template<typename _Tp> inline
|
||||
const _Tp* GpuMat::ptr(int y) const
|
||||
{
|
||||
return (const _Tp*)ptr(y);
|
||||
}
|
||||
|
||||
template <class T> inline
|
||||
GpuMat::operator PtrStepSz<T>() const
|
||||
{
|
||||
return PtrStepSz<T>(rows, cols, (T*)data, step);
|
||||
}
|
||||
|
||||
template <class T> inline
|
||||
GpuMat::operator PtrStep<T>() const
|
||||
{
|
||||
return PtrStep<T>((T*)data, step);
|
||||
}
|
||||
|
||||
inline
|
||||
GpuMat GpuMat::row(int y) const
|
||||
{
|
||||
return GpuMat(*this, Range(y, y+1), Range::all());
|
||||
}
|
||||
|
||||
inline
|
||||
GpuMat GpuMat::col(int x) const
|
||||
{
|
||||
return GpuMat(*this, Range::all(), Range(x, x+1));
|
||||
}
|
||||
|
||||
inline
|
||||
GpuMat GpuMat::rowRange(int startrow, int endrow) const
|
||||
{
|
||||
return GpuMat(*this, Range(startrow, endrow), Range::all());
|
||||
}
|
||||
|
||||
inline
|
||||
GpuMat GpuMat::rowRange(Range r) const
|
||||
{
|
||||
return GpuMat(*this, r, Range::all());
|
||||
}
|
||||
|
||||
inline
|
||||
GpuMat GpuMat::colRange(int startcol, int endcol) const
|
||||
{
|
||||
return GpuMat(*this, Range::all(), Range(startcol, endcol));
|
||||
}
|
||||
|
||||
inline
|
||||
GpuMat GpuMat::colRange(Range r) const
|
||||
{
|
||||
return GpuMat(*this, Range::all(), r);
|
||||
}
|
||||
|
||||
inline
|
||||
GpuMat GpuMat::operator ()(Range rowRange_, Range colRange_) const
|
||||
{
|
||||
return GpuMat(*this, rowRange_, colRange_);
|
||||
}
|
||||
|
||||
inline
|
||||
GpuMat GpuMat::operator ()(Rect roi) const
|
||||
{
|
||||
return GpuMat(*this, roi);
|
||||
}
|
||||
|
||||
inline
|
||||
bool GpuMat::isContinuous() const
|
||||
{
|
||||
return (flags & Mat::CONTINUOUS_FLAG) != 0;
|
||||
}
|
||||
|
||||
inline
|
||||
size_t GpuMat::elemSize() const
|
||||
{
|
||||
return CV_ELEM_SIZE(flags);
|
||||
}
|
||||
|
||||
inline
|
||||
size_t GpuMat::elemSize1() const
|
||||
{
|
||||
return CV_ELEM_SIZE1(flags);
|
||||
}
|
||||
|
||||
inline
|
||||
int GpuMat::type() const
|
||||
{
|
||||
return CV_MAT_TYPE(flags);
|
||||
}
|
||||
|
||||
inline
|
||||
int GpuMat::depth() const
|
||||
{
|
||||
return CV_MAT_DEPTH(flags);
|
||||
}
|
||||
|
||||
inline
|
||||
int GpuMat::channels() const
|
||||
{
|
||||
return CV_MAT_CN(flags);
|
||||
}
|
||||
|
||||
inline
|
||||
size_t GpuMat::step1() const
|
||||
{
|
||||
return step / elemSize1();
|
||||
}
|
||||
|
||||
inline
|
||||
Size GpuMat::size() const
|
||||
{
|
||||
return Size(cols, rows);
|
||||
}
|
||||
|
||||
inline
|
||||
bool GpuMat::empty() const
|
||||
{
|
||||
return data == 0;
|
||||
}
|
||||
|
||||
static inline
|
||||
GpuMat createContinuous(int rows, int cols, int type)
|
||||
{
|
||||
GpuMat m;
|
||||
createContinuous(rows, cols, type, m);
|
||||
return m;
|
||||
}
|
||||
|
||||
static inline
|
||||
void createContinuous(Size size, int type, OutputArray arr)
|
||||
{
|
||||
createContinuous(size.height, size.width, type, arr);
|
||||
}
|
||||
|
||||
static inline
|
||||
GpuMat createContinuous(Size size, int type)
|
||||
{
|
||||
GpuMat m;
|
||||
createContinuous(size, type, m);
|
||||
return m;
|
||||
}
|
||||
|
||||
static inline
|
||||
void ensureSizeIsEnough(Size size, int type, OutputArray arr)
|
||||
{
|
||||
ensureSizeIsEnough(size.height, size.width, type, arr);
|
||||
}
|
||||
|
||||
static inline
|
||||
void swap(GpuMat& a, GpuMat& b)
|
||||
{
|
||||
a.swap(b);
|
||||
}
|
||||
|
||||
//===================================================================================
|
||||
// HostMem
|
||||
//===================================================================================
|
||||
|
||||
inline
|
||||
HostMem::HostMem(AllocType alloc_type_)
|
||||
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
|
||||
{
|
||||
}
|
||||
|
||||
inline
|
||||
HostMem::HostMem(const HostMem& m)
|
||||
: flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
|
||||
{
|
||||
if( refcount )
|
||||
CV_XADD(refcount, 1);
|
||||
}
|
||||
|
||||
inline
|
||||
HostMem::HostMem(int rows_, int cols_, int type_, AllocType alloc_type_)
|
||||
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
|
||||
{
|
||||
if (rows_ > 0 && cols_ > 0)
|
||||
create(rows_, cols_, type_);
|
||||
}
|
||||
|
||||
inline
|
||||
HostMem::HostMem(Size size_, int type_, AllocType alloc_type_)
|
||||
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
|
||||
{
|
||||
if (size_.height > 0 && size_.width > 0)
|
||||
create(size_.height, size_.width, type_);
|
||||
}
|
||||
|
||||
inline
|
||||
HostMem::HostMem(InputArray arr, AllocType alloc_type_)
|
||||
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
|
||||
{
|
||||
arr.getMat().copyTo(*this);
|
||||
}
|
||||
|
||||
inline
|
||||
HostMem::~HostMem()
|
||||
{
|
||||
release();
|
||||
}
|
||||
|
||||
inline
|
||||
HostMem& HostMem::operator =(const HostMem& m)
|
||||
{
|
||||
if (this != &m)
|
||||
{
|
||||
HostMem temp(m);
|
||||
swap(temp);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline
|
||||
void HostMem::swap(HostMem& b)
|
||||
{
|
||||
std::swap(flags, b.flags);
|
||||
std::swap(rows, b.rows);
|
||||
std::swap(cols, b.cols);
|
||||
std::swap(step, b.step);
|
||||
std::swap(data, b.data);
|
||||
std::swap(datastart, b.datastart);
|
||||
std::swap(dataend, b.dataend);
|
||||
std::swap(refcount, b.refcount);
|
||||
std::swap(alloc_type, b.alloc_type);
|
||||
}
|
||||
|
||||
inline
|
||||
HostMem HostMem::clone() const
|
||||
{
|
||||
HostMem m(size(), type(), alloc_type);
|
||||
createMatHeader().copyTo(m);
|
||||
return m;
|
||||
}
|
||||
|
||||
inline
|
||||
void HostMem::create(Size size_, int type_)
|
||||
{
|
||||
create(size_.height, size_.width, type_);
|
||||
}
|
||||
|
||||
inline
|
||||
Mat HostMem::createMatHeader() const
|
||||
{
|
||||
return Mat(size(), type(), data, step);
|
||||
}
|
||||
|
||||
inline
|
||||
bool HostMem::isContinuous() const
|
||||
{
|
||||
return (flags & Mat::CONTINUOUS_FLAG) != 0;
|
||||
}
|
||||
|
||||
inline
|
||||
size_t HostMem::elemSize() const
|
||||
{
|
||||
return CV_ELEM_SIZE(flags);
|
||||
}
|
||||
|
||||
inline
|
||||
size_t HostMem::elemSize1() const
|
||||
{
|
||||
return CV_ELEM_SIZE1(flags);
|
||||
}
|
||||
|
||||
inline
|
||||
int HostMem::type() const
|
||||
{
|
||||
return CV_MAT_TYPE(flags);
|
||||
}
|
||||
|
||||
inline
|
||||
int HostMem::depth() const
|
||||
{
|
||||
return CV_MAT_DEPTH(flags);
|
||||
}
|
||||
|
||||
inline
|
||||
int HostMem::channels() const
|
||||
{
|
||||
return CV_MAT_CN(flags);
|
||||
}
|
||||
|
||||
inline
|
||||
size_t HostMem::step1() const
|
||||
{
|
||||
return step / elemSize1();
|
||||
}
|
||||
|
||||
inline
|
||||
Size HostMem::size() const
|
||||
{
|
||||
return Size(cols, rows);
|
||||
}
|
||||
|
||||
inline
|
||||
bool HostMem::empty() const
|
||||
{
|
||||
return data == 0;
|
||||
}
|
||||
|
||||
static inline
|
||||
void swap(HostMem& a, HostMem& b)
|
||||
{
|
||||
a.swap(b);
|
||||
}
|
||||
|
||||
//===================================================================================
|
||||
// Stream
|
||||
//===================================================================================
|
||||
|
||||
inline
|
||||
Stream::Stream(const Ptr<Impl>& impl)
|
||||
: impl_(impl)
|
||||
{
|
||||
}
|
||||
|
||||
//===================================================================================
|
||||
// Event
|
||||
//===================================================================================
|
||||
|
||||
inline
|
||||
Event::Event(const Ptr<Impl>& impl)
|
||||
: impl_(impl)
|
||||
{
|
||||
}
|
||||
|
||||
//===================================================================================
|
||||
// Initialization & Info
|
||||
//===================================================================================
|
||||
|
||||
inline
|
||||
bool TargetArchs::has(int major, int minor)
|
||||
{
|
||||
return hasPtx(major, minor) || hasBin(major, minor);
|
||||
}
|
||||
|
||||
inline
|
||||
bool TargetArchs::hasEqualOrGreater(int major, int minor)
|
||||
{
|
||||
return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
|
||||
}
|
||||
|
||||
inline
|
||||
DeviceInfo::DeviceInfo()
|
||||
{
|
||||
device_id_ = getDevice();
|
||||
}
|
||||
|
||||
inline
|
||||
DeviceInfo::DeviceInfo(int device_id)
|
||||
{
|
||||
CV_Assert( device_id >= 0 && device_id < getCudaEnabledDeviceCount() );
|
||||
device_id_ = device_id;
|
||||
}
|
||||
|
||||
inline
|
||||
int DeviceInfo::deviceID() const
|
||||
{
|
||||
return device_id_;
|
||||
}
|
||||
|
||||
inline
|
||||
size_t DeviceInfo::freeMemory() const
|
||||
{
|
||||
size_t _totalMemory = 0, _freeMemory = 0;
|
||||
queryMemory(_totalMemory, _freeMemory);
|
||||
return _freeMemory;
|
||||
}
|
||||
|
||||
inline
|
||||
size_t DeviceInfo::totalMemory() const
|
||||
{
|
||||
size_t _totalMemory = 0, _freeMemory = 0;
|
||||
queryMemory(_totalMemory, _freeMemory);
|
||||
return _totalMemory;
|
||||
}
|
||||
|
||||
inline
|
||||
bool DeviceInfo::supports(FeatureSet feature_set) const
|
||||
{
|
||||
int version = majorVersion() * 10 + minorVersion();
|
||||
return version >= feature_set;
|
||||
}
|
||||
|
||||
|
||||
}} // namespace cv { namespace cuda {
|
||||
|
||||
//===================================================================================
|
||||
// Mat
|
||||
//===================================================================================
|
||||
|
||||
namespace cv {
|
||||
|
||||
inline
|
||||
Mat::Mat(const cuda::GpuMat& m)
|
||||
: flags(0), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows)
|
||||
{
|
||||
m.download(*this);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CORE_CUDAINL_HPP
|
@ -0,0 +1,211 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_DEVICE_BLOCK_HPP
|
||||
#define OPENCV_CUDA_DEVICE_BLOCK_HPP
|
||||
|
||||
/** @file
|
||||
* @deprecated Use @ref cudev instead.
|
||||
*/
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
struct Block
|
||||
{
|
||||
static __device__ __forceinline__ unsigned int id()
|
||||
{
|
||||
return blockIdx.x;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int stride()
|
||||
{
|
||||
return blockDim.x * blockDim.y * blockDim.z;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void sync()
|
||||
{
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int flattenedThreadId()
|
||||
{
|
||||
return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
|
||||
}
|
||||
|
||||
template<typename It, typename T>
|
||||
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
|
||||
{
|
||||
int STRIDE = stride();
|
||||
It t = beg + flattenedThreadId();
|
||||
|
||||
for(; t < end; t += STRIDE)
|
||||
*t = value;
|
||||
}
|
||||
|
||||
template<typename OutIt, typename T>
|
||||
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
|
||||
{
|
||||
int STRIDE = stride();
|
||||
int tid = flattenedThreadId();
|
||||
value += tid;
|
||||
|
||||
for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
|
||||
*t = value;
|
||||
}
|
||||
|
||||
template<typename InIt, typename OutIt>
|
||||
static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
|
||||
{
|
||||
int STRIDE = stride();
|
||||
InIt t = beg + flattenedThreadId();
|
||||
OutIt o = out + (t - beg);
|
||||
|
||||
for(; t < end; t += STRIDE, o += STRIDE)
|
||||
*o = *t;
|
||||
}
|
||||
|
||||
template<typename InIt, typename OutIt, class UnOp>
|
||||
static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
|
||||
{
|
||||
int STRIDE = stride();
|
||||
InIt t = beg + flattenedThreadId();
|
||||
OutIt o = out + (t - beg);
|
||||
|
||||
for(; t < end; t += STRIDE, o += STRIDE)
|
||||
*o = op(*t);
|
||||
}
|
||||
|
||||
template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
|
||||
static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
|
||||
{
|
||||
int STRIDE = stride();
|
||||
InIt1 t1 = beg1 + flattenedThreadId();
|
||||
InIt2 t2 = beg2 + flattenedThreadId();
|
||||
OutIt o = out + (t1 - beg1);
|
||||
|
||||
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
|
||||
*o = op(*t1, *t2);
|
||||
}
|
||||
|
||||
template<int CTA_SIZE, typename T, class BinOp>
|
||||
static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
|
||||
{
|
||||
int tid = flattenedThreadId();
|
||||
T val = buffer[tid];
|
||||
|
||||
if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
|
||||
if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
|
||||
if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
|
||||
if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
|
||||
if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
|
||||
if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
|
||||
if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
|
||||
if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
|
||||
if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
|
||||
}
|
||||
}
|
||||
|
||||
template<int CTA_SIZE, typename T, class BinOp>
|
||||
static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
|
||||
{
|
||||
int tid = flattenedThreadId();
|
||||
T val = buffer[tid] = init;
|
||||
__syncthreads();
|
||||
|
||||
if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
|
||||
if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
|
||||
if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
|
||||
if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
|
||||
if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
|
||||
if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
|
||||
if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
|
||||
if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
|
||||
if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
|
||||
}
|
||||
__syncthreads();
|
||||
return buffer[0];
|
||||
}
|
||||
|
||||
template <typename T, class BinOp>
|
||||
static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
|
||||
{
|
||||
int ftid = flattenedThreadId();
|
||||
int sft = stride();
|
||||
|
||||
if (sft < n)
|
||||
{
|
||||
for (unsigned int i = sft + ftid; i < n; i += sft)
|
||||
data[ftid] = op(data[ftid], data[i]);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
n = sft;
|
||||
}
|
||||
|
||||
while (n > 1)
|
||||
{
|
||||
unsigned int half = n/2;
|
||||
|
||||
if (ftid < half)
|
||||
data[ftid] = op(data[ftid], data[n - ftid - 1]);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
n = n - half;
|
||||
}
|
||||
}
|
||||
};
|
||||
}}}
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif /* OPENCV_CUDA_DEVICE_BLOCK_HPP */
|
@ -0,0 +1,722 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_BORDER_INTERPOLATE_HPP
|
||||
#define OPENCV_CUDA_BORDER_INTERPOLATE_HPP
|
||||
|
||||
#include "saturate_cast.hpp"
|
||||
#include "vec_traits.hpp"
|
||||
#include "vec_math.hpp"
|
||||
|
||||
/** @file
|
||||
* @deprecated Use @ref cudev instead.
|
||||
*/
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
//////////////////////////////////////////////////////////////
|
||||
// BrdConstant
|
||||
|
||||
template <typename D> struct BrdRowConstant
|
||||
{
|
||||
typedef D result_type;
|
||||
|
||||
explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
|
||||
{
|
||||
return x >= 0 ? saturate_cast<D>(data[x]) : val;
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
|
||||
{
|
||||
return x < width ? saturate_cast<D>(data[x]) : val;
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
|
||||
{
|
||||
return (x >= 0 && x < width) ? saturate_cast<D>(data[x]) : val;
|
||||
}
|
||||
|
||||
int width;
|
||||
D val;
|
||||
};
|
||||
|
||||
template <typename D> struct BrdColConstant
|
||||
{
|
||||
typedef D result_type;
|
||||
|
||||
explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
|
||||
{
|
||||
return y >= 0 ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
|
||||
{
|
||||
return y < height ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
|
||||
{
|
||||
return (y >= 0 && y < height) ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
|
||||
}
|
||||
|
||||
int height;
|
||||
D val;
|
||||
};
|
||||
|
||||
template <typename D> struct BrdConstant
|
||||
{
|
||||
typedef D result_type;
|
||||
|
||||
__host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : height(height_), width(width_), val(val_)
|
||||
{
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
|
||||
{
|
||||
return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(((const T*)((const uchar*)data + y * step))[x]) : val;
|
||||
}
|
||||
|
||||
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
|
||||
{
|
||||
return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;
|
||||
}
|
||||
|
||||
int height;
|
||||
int width;
|
||||
D val;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
// BrdReplicate
|
||||
|
||||
template <typename D> struct BrdRowReplicate
|
||||
{
|
||||
typedef D result_type;
|
||||
|
||||
explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {}
|
||||
template <typename U> __host__ __device__ __forceinline__ BrdRowReplicate(int width, U) : last_col(width - 1) {}
|
||||
|
||||
__device__ __forceinline__ int idx_col_low(int x) const
|
||||
{
|
||||
return ::max(x, 0);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_col_high(int x) const
|
||||
{
|
||||
return ::min(x, last_col);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_col(int x) const
|
||||
{
|
||||
return idx_col_low(idx_col_high(x));
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
|
||||
{
|
||||
return saturate_cast<D>(data[idx_col_low(x)]);
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
|
||||
{
|
||||
return saturate_cast<D>(data[idx_col_high(x)]);
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
|
||||
{
|
||||
return saturate_cast<D>(data[idx_col(x)]);
|
||||
}
|
||||
|
||||
int last_col;
|
||||
};
|
||||
|
||||
template <typename D> struct BrdColReplicate
|
||||
{
|
||||
typedef D result_type;
|
||||
|
||||
explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {}
|
||||
template <typename U> __host__ __device__ __forceinline__ BrdColReplicate(int height, U) : last_row(height - 1) {}
|
||||
|
||||
__device__ __forceinline__ int idx_row_low(int y) const
|
||||
{
|
||||
return ::max(y, 0);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_row_high(int y) const
|
||||
{
|
||||
return ::min(y, last_row);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_row(int y) const
|
||||
{
|
||||
return idx_row_low(idx_row_high(y));
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
|
||||
{
|
||||
return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
|
||||
{
|
||||
return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
|
||||
{
|
||||
return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
|
||||
}
|
||||
|
||||
int last_row;
|
||||
};
|
||||
|
||||
template <typename D> struct BrdReplicate
|
||||
{
|
||||
typedef D result_type;
|
||||
|
||||
__host__ __device__ __forceinline__ BrdReplicate(int height, int width) : last_row(height - 1), last_col(width - 1) {}
|
||||
template <typename U> __host__ __device__ __forceinline__ BrdReplicate(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}
|
||||
|
||||
__device__ __forceinline__ int idx_row_low(int y) const
|
||||
{
|
||||
return ::max(y, 0);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_row_high(int y) const
|
||||
{
|
||||
return ::min(y, last_row);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_row(int y) const
|
||||
{
|
||||
return idx_row_low(idx_row_high(y));
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_col_low(int x) const
|
||||
{
|
||||
return ::max(x, 0);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_col_high(int x) const
|
||||
{
|
||||
return ::min(x, last_col);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_col(int x) const
|
||||
{
|
||||
return idx_col_low(idx_col_high(x));
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
|
||||
{
|
||||
return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
|
||||
}
|
||||
|
||||
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
|
||||
{
|
||||
return saturate_cast<D>(src(idx_row(y), idx_col(x)));
|
||||
}
|
||||
|
||||
int last_row;
|
||||
int last_col;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
// BrdReflect101
|
||||
|
||||
template <typename D> struct BrdRowReflect101
|
||||
{
|
||||
typedef D result_type;
|
||||
|
||||
explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {}
|
||||
template <typename U> __host__ __device__ __forceinline__ BrdRowReflect101(int width, U) : last_col(width - 1) {}
|
||||
|
||||
__device__ __forceinline__ int idx_col_low(int x) const
|
||||
{
|
||||
return ::abs(x) % (last_col + 1);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_col_high(int x) const
|
||||
{
|
||||
return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int idx_col(int x) const
|
||||
{
|
||||
return idx_col_low(idx_col_high(x));
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
|
||||
{
|
||||
return saturate_cast<D>(data[idx_col_low(x)]);
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
|
||||
{
|
||||
return saturate_cast<D>(data[idx_col_high(x)]);
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
|
||||
{
|
||||
return saturate_cast<D>(data[idx_col(x)]);
|
||||
}
|
||||
|
||||
int last_col;
|
||||
};
|
||||
|
||||
// Reflect-101 vertical border handler (row-index counterpart of
// BrdRowReflect101). Rows are addressed through a byte stride ("step"),
// so row fetches cast the base pointer to char* before offsetting.
template <typename D> struct BrdColReflect101
{
    typedef D result_type;

    explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}
    // Dummy-parameter overload for signature uniformity with value-carrying borders.
    template <typename U> __host__ __device__ __forceinline__ BrdColReflect101(int height, U) : last_row(height - 1) {}

    // Mirror a possibly-negative row index back into [0, last_row].
    __device__ __forceinline__ int idx_row_low(int y) const
    {
        return ::abs(y) % (last_row + 1);
    }

    // Mirror a possibly-overflowing row index back into [0, last_row].
    __device__ __forceinline__ int idx_row_high(int y) const
    {
        return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);
    }

    // Resolve a row index that may be out of range on either side.
    __device__ __forceinline__ int idx_row(int y) const
    {
        return idx_row_low(idx_row_high(y));
    }

    // NOTE(review): these fetches reinterpret the row start as D (not T),
    // matching the upstream OpenCV layout assumption — confirm T == D per column.
    template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
    {
        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));
    }

    template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
    {
        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));
    }

    template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
    {
        return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));
    }

    int last_row; // index of the last valid row (height - 1)
};
|
||||
|
||||
// Reflect-101 border handler for both axes; combines the row and column
// index mapping of BrdRowReflect101 / BrdColReflect101 for 2D access.
template <typename D> struct BrdReflect101
{
    typedef D result_type;

    __host__ __device__ __forceinline__ BrdReflect101(int height, int width) : last_row(height - 1), last_col(width - 1) {}
    // Dummy-parameter overload for signature uniformity with value-carrying borders.
    template <typename U> __host__ __device__ __forceinline__ BrdReflect101(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}

    // Mirror a possibly-negative row index back into [0, last_row].
    __device__ __forceinline__ int idx_row_low(int y) const
    {
        return ::abs(y) % (last_row + 1);
    }

    // Mirror a possibly-overflowing row index back into [0, last_row].
    __device__ __forceinline__ int idx_row_high(int y) const
    {
        return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);
    }

    // Resolve a row index that may be out of range on either side.
    __device__ __forceinline__ int idx_row(int y) const
    {
        return idx_row_low(idx_row_high(y));
    }

    // Mirror a possibly-negative column index back into [0, last_col].
    __device__ __forceinline__ int idx_col_low(int x) const
    {
        return ::abs(x) % (last_col + 1);
    }

    // Mirror a possibly-overflowing column index back into [0, last_col].
    __device__ __forceinline__ int idx_col_high(int x) const
    {
        return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);
    }

    // Resolve a column index that may be out of range on either side.
    __device__ __forceinline__ int idx_col(int x) const
    {
        return idx_col_low(idx_col_high(x));
    }

    // Fetch pixel (y, x) from a strided raw buffer ("step" is the row pitch in bytes).
    template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
    {
        return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
    }

    // Fetch pixel (y, x) through any 2D-indexable source (e.g. PtrStep / texture wrapper).
    template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
    {
        return saturate_cast<D>(src(idx_row(y), idx_col(x)));
    }

    int last_row; // index of the last valid row (height - 1)
    int last_col; // index of the last valid column (width - 1)
};
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
// BrdReflect
|
||||
|
||||
// Reflect horizontal border handler: fedcba|abcdefgh|hgfedcb.
// Like reflect-101 but the edge pixel itself is repeated in the mirror
// (OpenCV BORDER_REFLECT); hence the extra (x < 0) / (x > last_col) terms.
template <typename D> struct BrdRowReflect
{
    typedef D result_type;

    explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {}
    // Dummy-parameter overload for signature uniformity with value-carrying borders.
    template <typename U> __host__ __device__ __forceinline__ BrdRowReflect(int width, U) : last_col(width - 1) {}

    // Mirror a possibly-negative column index; (x < 0) shifts by one so the
    // edge pixel is included in the reflection.
    __device__ __forceinline__ int idx_col_low(int x) const
    {
        return (::abs(x) - (x < 0)) % (last_col + 1);
    }

    // Mirror a possibly-overflowing column index, edge pixel included.
    __device__ __forceinline__ int idx_col_high(int x) const
    {
        return ::abs(last_col - ::abs(last_col - x) + (x > last_col)) % (last_col + 1);
    }

    // Resolve a column index that may be out of range on either side.
    __device__ __forceinline__ int idx_col(int x) const
    {
        return idx_col_high(::abs(x) - (x < 0));
    }

    // Fetch a pixel, resolving a possibly-underflowing index.
    template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
    {
        return saturate_cast<D>(data[idx_col_low(x)]);
    }

    // Fetch a pixel, resolving a possibly-overflowing index.
    template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
    {
        return saturate_cast<D>(data[idx_col_high(x)]);
    }

    // Fetch a pixel, resolving the index in both directions.
    template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
    {
        return saturate_cast<D>(data[idx_col(x)]);
    }

    int last_col; // index of the last valid column (width - 1)
};
|
||||
|
||||
// Reflect vertical border handler (row-index counterpart of BrdRowReflect).
// Rows are addressed through a byte stride ("step").
template <typename D> struct BrdColReflect
{
    typedef D result_type;

    explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {}
    // Dummy-parameter overload for signature uniformity with value-carrying borders.
    template <typename U> __host__ __device__ __forceinline__ BrdColReflect(int height, U) : last_row(height - 1) {}

    // Mirror a possibly-negative row index, edge pixel included in the mirror.
    __device__ __forceinline__ int idx_row_low(int y) const
    {
        return (::abs(y) - (y < 0)) % (last_row + 1);
    }

    // Mirror a possibly-overflowing row index, edge pixel included.
    __device__ __forceinline__ int idx_row_high(int y) const
    {
        return ::abs(last_row - ::abs(last_row - y) + (y > last_row)) % (last_row + 1);
    }

    // Resolve a row index that may be out of range on either side.
    __device__ __forceinline__ int idx_row(int y) const
    {
        return idx_row_high(::abs(y) - (y < 0));
    }

    // NOTE(review): these fetches reinterpret the row start as D (not T),
    // matching the upstream OpenCV layout assumption — confirm T == D per column.
    template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
    {
        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));
    }

    template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
    {
        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));
    }

    template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
    {
        return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));
    }

    int last_row; // index of the last valid row (height - 1)
};
|
||||
|
||||
// Reflect border handler for both axes (edge pixel repeated in the mirror).
template <typename D> struct BrdReflect
{
    typedef D result_type;

    __host__ __device__ __forceinline__ BrdReflect(int height, int width) : last_row(height - 1), last_col(width - 1) {}
    // Dummy-parameter overload for signature uniformity with value-carrying borders.
    template <typename U> __host__ __device__ __forceinline__ BrdReflect(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}

    // Mirror a possibly-negative row index, edge pixel included.
    __device__ __forceinline__ int idx_row_low(int y) const
    {
        return (::abs(y) - (y < 0)) % (last_row + 1);
    }

    // NOTE(review): upstream OpenCV ships this with the outer ::abs and the
    // final modulo commented out — valid only for overflows of less than one
    // image height; preserved verbatim. Confirm against callers before changing.
    __device__ __forceinline__ int idx_row_high(int y) const
    {
        return /*::abs*/(last_row - ::abs(last_row - y) + (y > last_row)) /*% (last_row + 1)*/;
    }

    // Resolve a row index that may be out of range on either side.
    __device__ __forceinline__ int idx_row(int y) const
    {
        return idx_row_low(idx_row_high(y));
    }

    // Mirror a possibly-negative column index, edge pixel included.
    __device__ __forceinline__ int idx_col_low(int x) const
    {
        return (::abs(x) - (x < 0)) % (last_col + 1);
    }

    // Mirror a possibly-overflowing column index (no wrap-around modulo here;
    // valid for overflows of less than one image width, as upstream).
    __device__ __forceinline__ int idx_col_high(int x) const
    {
        return (last_col - ::abs(last_col - x) + (x > last_col));
    }

    // Resolve a column index that may be out of range on either side.
    __device__ __forceinline__ int idx_col(int x) const
    {
        return idx_col_low(idx_col_high(x));
    }

    // Fetch pixel (y, x) from a strided raw buffer ("step" is the row pitch in bytes).
    template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
    {
        return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
    }

    // Fetch pixel (y, x) through any 2D-indexable source.
    template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
    {
        return saturate_cast<D>(src(idx_row(y), idx_col(x)));
    }

    int last_row; // index of the last valid row (height - 1)
    int last_col; // index of the last valid column (width - 1)
};
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
// BrdWrap
|
||||
|
||||
// Wrap horizontal border handler: cdefgh|abcdefgh|abcdefg.
// Out-of-range column indices wrap around periodically (OpenCV BORDER_WRAP).
template <typename D> struct BrdRowWrap
{
    typedef D result_type;

    explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {}
    // Dummy-parameter overload for signature uniformity with value-carrying borders.
    template <typename U> __host__ __device__ __forceinline__ BrdRowWrap(int width_, U) : width(width_) {}

    // Wrap a possibly-negative column index into [0, width).
    // Branchless select: exactly one of the two products is non-zero.
    __device__ __forceinline__ int idx_col_low(int x) const
    {
        return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);
    }

    // Wrap a possibly-overflowing column index into [0, width).
    __device__ __forceinline__ int idx_col_high(int x) const
    {
        return (x < width) * x + (x >= width) * (x % width);
    }

    // Resolve a column index that may be out of range on either side.
    __device__ __forceinline__ int idx_col(int x) const
    {
        return idx_col_high(idx_col_low(x));
    }

    // Fetch a pixel, resolving a possibly-underflowing index.
    template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
    {
        return saturate_cast<D>(data[idx_col_low(x)]);
    }

    // Fetch a pixel, resolving a possibly-overflowing index.
    template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
    {
        return saturate_cast<D>(data[idx_col_high(x)]);
    }

    // Fetch a pixel, resolving the index in both directions.
    template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
    {
        return saturate_cast<D>(data[idx_col(x)]);
    }

    int width; // image width; wrap period
};
|
||||
|
||||
// Wrap vertical border handler (row-index counterpart of BrdRowWrap).
// Rows are addressed through a byte stride ("step").
template <typename D> struct BrdColWrap
{
    typedef D result_type;

    explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {}
    // Dummy-parameter overload for signature uniformity with value-carrying borders.
    template <typename U> __host__ __device__ __forceinline__ BrdColWrap(int height_, U) : height(height_) {}

    // Wrap a possibly-negative row index into [0, height). Branchless select.
    __device__ __forceinline__ int idx_row_low(int y) const
    {
        return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);
    }

    // Wrap a possibly-overflowing row index into [0, height).
    __device__ __forceinline__ int idx_row_high(int y) const
    {
        return (y < height) * y + (y >= height) * (y % height);
    }

    // Resolve a row index that may be out of range on either side.
    __device__ __forceinline__ int idx_row(int y) const
    {
        return idx_row_high(idx_row_low(y));
    }

    // NOTE(review): these fetches reinterpret the row start as D (not T),
    // matching the upstream OpenCV layout assumption — confirm T == D per column.
    template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
    {
        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));
    }

    template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
    {
        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));
    }

    template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
    {
        return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));
    }

    int height; // image height; wrap period
};
|
||||
|
||||
// Wrap border handler for both axes; periodic wrap-around of out-of-range
// indices (OpenCV BORDER_WRAP), written with ternaries rather than the
// branchless arithmetic used by the 1D variants above.
template <typename D> struct BrdWrap
{
    typedef D result_type;

    __host__ __device__ __forceinline__ BrdWrap(int height_, int width_) :
        height(height_), width(width_)
    {
    }
    // Dummy-parameter overload for signature uniformity with value-carrying borders.
    template <typename U>
    __host__ __device__ __forceinline__ BrdWrap(int height_, int width_, U) :
        height(height_), width(width_)
    {
    }

    // Wrap a possibly-negative row index into [0, height).
    __device__ __forceinline__ int idx_row_low(int y) const
    {
        return (y >= 0) ? y : (y - ((y - height + 1) / height) * height);
    }

    // Wrap a possibly-overflowing row index into [0, height).
    __device__ __forceinline__ int idx_row_high(int y) const
    {
        return (y < height) ? y : (y % height);
    }

    // Resolve a row index that may be out of range on either side.
    __device__ __forceinline__ int idx_row(int y) const
    {
        return idx_row_high(idx_row_low(y));
    }

    // Wrap a possibly-negative column index into [0, width).
    __device__ __forceinline__ int idx_col_low(int x) const
    {
        return (x >= 0) ? x : (x - ((x - width + 1) / width) * width);
    }

    // Wrap a possibly-overflowing column index into [0, width).
    __device__ __forceinline__ int idx_col_high(int x) const
    {
        return (x < width) ? x : (x % width);
    }

    // Resolve a column index that may be out of range on either side.
    __device__ __forceinline__ int idx_col(int x) const
    {
        return idx_col_high(idx_col_low(x));
    }

    // Fetch pixel (y, x) from a strided raw buffer ("step" is the row pitch in bytes).
    template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
    {
        return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
    }

    // Fetch pixel (y, x) through any 2D-indexable source.
    template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
    {
        return saturate_cast<D>(src(idx_row(y), idx_col(x)));
    }

    int height; // image height; row wrap period
    int width;  // image width; column wrap period
};
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
// BorderReader
|
||||
|
||||
// Adapter that composes a 2D pixel source with a border-handling policy B:
// operator()(y, x) is safe for out-of-range coordinates, delegating the
// index resolution (and any fill value) to the policy's at().
template <typename Ptr2D, typename B> struct BorderReader
{
    typedef typename B::result_type elem_type;
    typedef typename Ptr2D::index_type index_type;

    __host__ __device__ __forceinline__ BorderReader(const Ptr2D& ptr_, const B& b_) : ptr(ptr_), b(b_) {}

    __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const
    {
        return b.at(y, x, ptr);
    }

    Ptr2D ptr; // underlying pixel source (copied by value for kernel capture)
    B b;       // border policy
};
|
||||
|
||||
// under win32 there is some bug with templated types that passed as kernel parameters
|
||||
// with this specialization all works fine
|
||||
// under win32 there is some bug with templated types that passed as kernel parameters
// with this specialization all works fine
// Specialization for BrdConstant: flattens the policy's members (height,
// width, fill value) directly into the reader so only plain fields cross
// the kernel-argument boundary.
template <typename Ptr2D, typename D> struct BorderReader< Ptr2D, BrdConstant<D> >
{
    typedef typename BrdConstant<D>::result_type elem_type;
    typedef typename Ptr2D::index_type index_type;

    __host__ __device__ __forceinline__ BorderReader(const Ptr2D& src_, const BrdConstant<D>& b) :
        src(src_), height(b.height), width(b.width), val(b.val)
    {
    }

    // In-range coordinates read through the source; everything else yields
    // the constant fill value.
    __device__ __forceinline__ D operator ()(index_type y, index_type x) const
    {
        return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;
    }

    Ptr2D src;  // underlying pixel source
    int height; // valid row range is [0, height)
    int width;  // valid column range is [0, width)
    D val;      // fill value returned outside the image
};
|
||||
}}} // namespace cv { namespace cuda { namespace cudev
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_BORDER_INTERPOLATE_HPP
|
@ -0,0 +1,309 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_COLOR_HPP
|
||||
#define OPENCV_CUDA_COLOR_HPP
|
||||
|
||||
#include "detail/color_detail.hpp"
|
||||
|
||||
/** @file
|
||||
* @deprecated Use @ref cudev instead.
|
||||
*/
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
{
    // All OPENCV_CUDA_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements
    // template <typename T> class ColorSpace1_to_ColorSpace2_traits
    // {
    //     typedef ... functor_type;
    //     static __host__ __device__ functor_type create_functor();
    // };
    //
    // Macro arguments are (name, src channels, dst channels[, srgb flag], blue
    // index) — blue index 0 selects BGR ordering, 2 selects RGB ordering.

    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)

    #undef OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS

    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)

    #undef OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS

    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)

    #undef OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS

    OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
    OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)

    #undef OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS

    OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
    OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)

    #undef OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS

    OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
    OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)

    #undef OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS

    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS

    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS

    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 2)
    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 0)
    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS

    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS

    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS

    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS

    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS

    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS

    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS

    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS

    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)

    #undef OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS

    // Lab/Luv traits take an extra bool: true = input is sRGB (gamma-encoded),
    // false = input is linear RGB (the l-prefixed variants).
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab, 3, 3, true, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab, 4, 3, true, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab4, 3, 4, true, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab4, 4, 4, true, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab, 3, 3, true, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab, 4, 3, true, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab4, 3, 4, true, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab4, 4, 4, true, 0)

    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab, 3, 3, false, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab, 4, 3, false, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab4, 3, 4, false, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab4, 4, 4, false, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab, 3, 3, false, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab, 4, 3, false, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab4, 3, 4, false, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab4, 4, 4, false, 0)

    #undef OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS

    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgb, 3, 3, true, 2)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgb, 4, 3, true, 2)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgba, 3, 4, true, 2)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgba, 4, 4, true, 2)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgr, 3, 3, true, 0)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgr, 4, 3, true, 0)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgra, 3, 4, true, 0)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgra, 4, 4, true, 0)

    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgb, 3, 3, false, 2)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgb, 4, 3, false, 2)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgba, 3, 4, false, 2)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgba, 4, 4, false, 2)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgr, 3, 3, false, 0)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgr, 4, 3, false, 0)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgra, 3, 4, false, 0)
    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgra, 4, 4, false, 0)

    #undef OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS

    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv, 3, 3, true, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv, 4, 3, true, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv4, 3, 4, true, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv4, 4, 4, true, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv, 3, 3, true, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv, 4, 3, true, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv4, 3, 4, true, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv4, 4, 4, true, 0)

    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv, 3, 3, false, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv, 4, 3, false, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv4, 3, 4, false, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv4, 4, 4, false, 2)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv, 3, 3, false, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv, 4, 3, false, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv4, 3, 4, false, 0)
    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv4, 4, 4, false, 0)

    #undef OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS

    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgb, 3, 3, true, 2)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgb, 4, 3, true, 2)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgba, 3, 4, true, 2)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgba, 4, 4, true, 2)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgr, 3, 3, true, 0)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgr, 4, 3, true, 0)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgra, 3, 4, true, 0)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgra, 4, 4, true, 0)

    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgb, 3, 3, false, 2)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgb, 4, 3, false, 2)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgba, 3, 4, false, 2)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgba, 4, 4, false, 2)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgr, 3, 3, false, 0)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgr, 4, 3, false, 0)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgra, 3, 4, false, 0)
    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)

    #undef OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS
}}} // namespace cv { namespace cuda { namespace device
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_COLOR_HPP
|
@ -0,0 +1,109 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_COMMON_HPP
|
||||
#define OPENCV_CUDA_COMMON_HPP
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include "opencv2/core/cuda_types.hpp"
|
||||
#include "opencv2/core/cvdef.h"
|
||||
#include "opencv2/core/base.hpp"
|
||||
|
||||
/** @file
|
||||
* @deprecated Use @ref cudev instead.
|
||||
*/
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
#ifndef CV_PI_F
|
||||
#ifndef CV_PI
|
||||
#define CV_PI_F 3.14159265f
|
||||
#else
|
||||
#define CV_PI_F ((float)CV_PI)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
namespace cv { namespace cuda {
|
||||
static inline void checkCudaError(cudaError_t err, const char* file, const int line, const char* func)
|
||||
{
|
||||
if (cudaSuccess != err)
|
||||
cv::error(cv::Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
|
||||
}
|
||||
}}
|
||||
|
||||
#ifndef cudaSafeCall
|
||||
#define cudaSafeCall(expr) cv::cuda::checkCudaError(expr, __FILE__, __LINE__, CV_Func)
|
||||
#endif
|
||||
|
||||
namespace cv { namespace cuda
|
||||
{
|
||||
template <typename T> static inline bool isAligned(const T* ptr, size_t size)
|
||||
{
|
||||
return reinterpret_cast<size_t>(ptr) % size == 0;
|
||||
}
|
||||
|
||||
static inline bool isAligned(size_t step, size_t size)
|
||||
{
|
||||
return step % size == 0;
|
||||
}
|
||||
}}
|
||||
|
||||
namespace cv { namespace cuda
|
||||
{
|
||||
namespace device
|
||||
{
|
||||
__host__ __device__ __forceinline__ int divUp(int total, int grain)
|
||||
{
|
||||
return (total + grain - 1) / grain;
|
||||
}
|
||||
|
||||
template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
|
||||
{
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
|
||||
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
|
||||
}
|
||||
}
|
||||
}}
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_COMMON_HPP
|
@ -0,0 +1,113 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_DATAMOV_UTILS_HPP
|
||||
#define OPENCV_CUDA_DATAMOV_UTILS_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
/** @file
|
||||
* @deprecated Use @ref cudev instead.
|
||||
*/
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
|
||||
|
||||
// for Fermi memory space is detected automatically
|
||||
template <typename T> struct ForceGlob
|
||||
{
|
||||
__device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; }
|
||||
};
|
||||
|
||||
#else // __CUDA_ARCH__ >= 200
|
||||
|
||||
#if defined(_WIN64) || defined(__LP64__)
|
||||
// 64-bit register modifier for inlined asm
|
||||
#define OPENCV_CUDA_ASM_PTR "l"
|
||||
#else
|
||||
// 32-bit register modifier for inlined asm
|
||||
#define OPENCV_CUDA_ASM_PTR "r"
|
||||
#endif
|
||||
|
||||
template<class T> struct ForceGlob;
|
||||
|
||||
#define OPENCV_CUDA_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
|
||||
template <> struct ForceGlob<base_type> \
|
||||
{ \
|
||||
__device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
|
||||
{ \
|
||||
asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : OPENCV_CUDA_ASM_PTR(ptr + offset)); \
|
||||
} \
|
||||
};
|
||||
|
||||
#define OPENCV_CUDA_DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
|
||||
template <> struct ForceGlob<base_type> \
|
||||
{ \
|
||||
__device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
|
||||
{ \
|
||||
asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_CUDA_ASM_PTR(ptr + offset)); \
|
||||
} \
|
||||
};
|
||||
|
||||
OPENCV_CUDA_DEFINE_FORCE_GLOB_B(uchar, u8)
|
||||
OPENCV_CUDA_DEFINE_FORCE_GLOB_B(schar, s8)
|
||||
OPENCV_CUDA_DEFINE_FORCE_GLOB_B(char, b8)
|
||||
OPENCV_CUDA_DEFINE_FORCE_GLOB (ushort, u16, h)
|
||||
OPENCV_CUDA_DEFINE_FORCE_GLOB (short, s16, h)
|
||||
OPENCV_CUDA_DEFINE_FORCE_GLOB (uint, u32, r)
|
||||
OPENCV_CUDA_DEFINE_FORCE_GLOB (int, s32, r)
|
||||
OPENCV_CUDA_DEFINE_FORCE_GLOB (float, f32, f)
|
||||
OPENCV_CUDA_DEFINE_FORCE_GLOB (double, f64, d)
|
||||
|
||||
#undef OPENCV_CUDA_DEFINE_FORCE_GLOB
|
||||
#undef OPENCV_CUDA_DEFINE_FORCE_GLOB_B
|
||||
#undef OPENCV_CUDA_ASM_PTR
|
||||
|
||||
#endif // __CUDA_ARCH__ >= 200
|
||||
}}} // namespace cv { namespace cuda { namespace cudev
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_DATAMOV_UTILS_HPP
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,365 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_REDUCE_DETAIL_HPP
|
||||
#define OPENCV_CUDA_REDUCE_DETAIL_HPP
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "../warp.hpp"
|
||||
#include "../warp_shuffle.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace reduce_detail
|
||||
{
|
||||
template <typename T> struct GetType;
|
||||
template <typename T> struct GetType<T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<volatile T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<T&>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
template <unsigned int I, unsigned int N>
|
||||
struct For
|
||||
{
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(val);
|
||||
|
||||
For<I + 1, N>::loadToSmem(smem, val, tid);
|
||||
}
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(val) = thrust::get<I>(smem)[tid];
|
||||
|
||||
For<I + 1, N>::loadFromSmem(smem, val, tid);
|
||||
}
|
||||
|
||||
template <class PointerTuple, class ValTuple, class OpTuple>
|
||||
static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
|
||||
|
||||
For<I + 1, N>::merge(smem, val, tid, delta, op);
|
||||
}
|
||||
template <class ValTuple, class OpTuple>
|
||||
static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
|
||||
thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
|
||||
|
||||
For<I + 1, N>::mergeShfl(val, delta, width, op);
|
||||
}
|
||||
};
|
||||
template <unsigned int N>
|
||||
struct For<N, N>
|
||||
{
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class PointerTuple, class ValTuple, class OpTuple>
|
||||
static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
|
||||
{
|
||||
}
|
||||
template <class ValTuple, class OpTuple>
|
||||
static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
|
||||
{
|
||||
smem[tid] = val;
|
||||
}
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
|
||||
{
|
||||
val = smem[tid];
|
||||
}
|
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
|
||||
__device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
|
||||
}
|
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
|
||||
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
|
||||
}
|
||||
|
||||
template <typename T, class Op>
|
||||
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
|
||||
{
|
||||
T reg = smem[tid + delta];
|
||||
smem[tid] = val = op(val, reg);
|
||||
}
|
||||
template <typename T, class Op>
|
||||
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
|
||||
{
|
||||
T reg = shfl_down(val, delta, width);
|
||||
val = op(val, reg);
|
||||
}
|
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
|
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
|
||||
__device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid,
|
||||
unsigned int delta,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
|
||||
}
|
||||
template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
|
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
|
||||
__device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int delta,
|
||||
unsigned int width,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
|
||||
}
|
||||
|
||||
template <unsigned int N> struct Generic
|
||||
{
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
loadToSmem(smem, val, tid);
|
||||
if (N >= 32)
|
||||
__syncthreads();
|
||||
|
||||
if (N >= 2048)
|
||||
{
|
||||
if (tid < 1024)
|
||||
merge(smem, val, tid, 1024, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 1024)
|
||||
{
|
||||
if (tid < 512)
|
||||
merge(smem, val, tid, 512, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 512)
|
||||
{
|
||||
if (tid < 256)
|
||||
merge(smem, val, tid, 256, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 256)
|
||||
{
|
||||
if (tid < 128)
|
||||
merge(smem, val, tid, 128, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 128)
|
||||
{
|
||||
if (tid < 64)
|
||||
merge(smem, val, tid, 64, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 64)
|
||||
{
|
||||
if (tid < 32)
|
||||
merge(smem, val, tid, 32, op);
|
||||
}
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
merge(smem, val, tid, 16, op);
|
||||
merge(smem, val, tid, 8, op);
|
||||
merge(smem, val, tid, 4, op);
|
||||
merge(smem, val, tid, 2, op);
|
||||
merge(smem, val, tid, 1, op);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int I, typename Pointer, typename Reference, class Op>
|
||||
struct Unroll
|
||||
{
|
||||
static __device__ void loopShfl(Reference val, Op op, unsigned int N)
|
||||
{
|
||||
mergeShfl(val, I, N, op);
|
||||
Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
|
||||
}
|
||||
static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
merge(smem, val, tid, I, op);
|
||||
Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||
}
|
||||
};
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
struct Unroll<0, Pointer, Reference, Op>
|
||||
{
|
||||
static __device__ void loopShfl(Reference, Op, unsigned int)
|
||||
{
|
||||
}
|
||||
static __device__ void loop(Pointer, Reference, unsigned int, Op)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct WarpOptimized
|
||||
{
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||
CV_UNUSED(smem);
|
||||
CV_UNUSED(tid);
|
||||
|
||||
Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
|
||||
#else
|
||||
loadToSmem(smem, val, tid);
|
||||
|
||||
if (tid < N / 2)
|
||||
Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct GenericOptimized32
|
||||
{
|
||||
enum { M = N / 32 };
|
||||
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
const unsigned int laneId = Warp::laneId();
|
||||
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||
Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
|
||||
|
||||
if (laneId == 0)
|
||||
loadToSmem(smem, val, tid / 32);
|
||||
#else
|
||||
loadToSmem(smem, val, tid);
|
||||
|
||||
if (laneId < 16)
|
||||
Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (laneId == 0)
|
||||
loadToSmem(smem, val, tid / 32);
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
loadFromSmem(smem, val, tid);
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||
Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
|
||||
#else
|
||||
Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool val, class T1, class T2> struct StaticIf;
|
||||
template <class T1, class T2> struct StaticIf<true, T1, T2>
|
||||
{
|
||||
typedef T1 type;
|
||||
};
|
||||
template <class T1, class T2> struct StaticIf<false, T1, T2>
|
||||
{
|
||||
typedef T2 type;
|
||||
};
|
||||
|
||||
template <unsigned int N> struct IsPowerOf2
|
||||
{
|
||||
enum { value = ((N != 0) && !(N & (N - 1))) };
|
||||
};
|
||||
|
||||
template <unsigned int N> struct Dispatcher
|
||||
{
|
||||
typedef typename StaticIf<
|
||||
(N <= 32) && IsPowerOf2<N>::value,
|
||||
WarpOptimized<N>,
|
||||
typename StaticIf<
|
||||
(N <= 1024) && IsPowerOf2<N>::value,
|
||||
GenericOptimized32<N>,
|
||||
Generic<N>
|
||||
>::type
|
||||
>::type reductor;
|
||||
};
|
||||
}
|
||||
}}}
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_REDUCE_DETAIL_HPP
|
@ -0,0 +1,502 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
|
||||
#define OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "../warp.hpp"
|
||||
#include "../warp_shuffle.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace reduce_key_val_detail
|
||||
{
|
||||
template <typename T> struct GetType;
|
||||
template <typename T> struct GetType<T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<volatile T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<T&>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
template <unsigned int I, unsigned int N>
|
||||
struct For
|
||||
{
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(data);
|
||||
|
||||
For<I + 1, N>::loadToSmem(smem, data, tid);
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(data) = thrust::get<I>(smem)[tid];
|
||||
|
||||
For<I + 1, N>::loadFromSmem(smem, data, tid);
|
||||
}
|
||||
|
||||
template <class ReferenceTuple>
|
||||
static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
|
||||
{
|
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
|
||||
|
||||
For<I + 1, N>::copyShfl(val, delta, width);
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
|
||||
|
||||
For<I + 1, N>::copy(svals, val, tid, delta);
|
||||
}
|
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
|
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
|
||||
{
|
||||
thrust::get<I>(key) = reg;
|
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
|
||||
}
|
||||
|
||||
For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
|
||||
}
|
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
|
||||
const ValPointerTuple& svals, const ValReferenceTuple& val,
|
||||
const CmpTuple& cmp,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];
|
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
|
||||
{
|
||||
thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
|
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
|
||||
}
|
||||
|
||||
For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
|
||||
}
|
||||
};
|
||||
template <unsigned int N>
|
||||
struct For<N, N>
|
||||
{
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class ReferenceTuple>
|
||||
static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
|
||||
{
|
||||
}
|
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
|
||||
const ValPointerTuple&, const ValReferenceTuple&,
|
||||
const CmpTuple&,
|
||||
unsigned int, unsigned int)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// loadToSmem
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
|
||||
{
|
||||
smem[tid] = data;
|
||||
}
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
|
||||
{
|
||||
data = smem[tid];
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// copyVals
|
||||
|
||||
template <typename V>
|
||||
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
|
||||
{
|
||||
val = shfl_down(val, delta, width);
|
||||
}
|
||||
template <typename V>
|
||||
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
svals[tid] = val = svals[tid + delta];
|
||||
}
|
||||
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int delta,
|
||||
int width)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// merge
|
||||
|
||||
template <typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
|
||||
{
|
||||
K reg = shfl_down(key, delta, width);
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
key = reg;
|
||||
copyValsShfl(val, delta, width);
|
||||
}
|
||||
}
|
||||
template <typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
K reg = skeys[tid + delta];
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
skeys[tid] = key = reg;
|
||||
copyVals(svals, val, tid, delta);
|
||||
}
|
||||
}
|
||||
template <typename K,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void mergeShfl(K& key,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const Cmp& cmp,
|
||||
unsigned int delta, int width)
|
||||
{
|
||||
K reg = shfl_down(key, delta, width);
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
key = reg;
|
||||
copyValsShfl(val, delta, width);
|
||||
}
|
||||
}
|
||||
template <typename K,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const Cmp& cmp, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
K reg = skeys[tid + delta];
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
skeys[tid] = key = reg;
|
||||
copyVals(svals, val, tid, delta);
|
||||
}
|
||||
}
|
||||
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
|
||||
unsigned int delta, int width)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
|
||||
}
|
||||
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
|
||||
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Generic
|
||||
|
||||
template <unsigned int N> struct Generic
|
||||
{
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadValsToSmem(svals, val, tid);
|
||||
if (N >= 32)
|
||||
__syncthreads();
|
||||
|
||||
if (N >= 2048)
|
||||
{
|
||||
if (tid < 1024)
|
||||
merge(skeys, key, svals, val, cmp, tid, 1024);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 1024)
|
||||
{
|
||||
if (tid < 512)
|
||||
merge(skeys, key, svals, val, cmp, tid, 512);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 512)
|
||||
{
|
||||
if (tid < 256)
|
||||
merge(skeys, key, svals, val, cmp, tid, 256);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 256)
|
||||
{
|
||||
if (tid < 128)
|
||||
merge(skeys, key, svals, val, cmp, tid, 128);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 128)
|
||||
{
|
||||
if (tid < 64)
|
||||
merge(skeys, key, svals, val, cmp, tid, 64);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 64)
|
||||
{
|
||||
if (tid < 32)
|
||||
merge(skeys, key, svals, val, cmp, tid, 32);
|
||||
}
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
merge(skeys, key, svals, val, cmp, tid, 16);
|
||||
merge(skeys, key, svals, val, cmp, tid, 8);
|
||||
merge(skeys, key, svals, val, cmp, tid, 4);
|
||||
merge(skeys, key, svals, val, cmp, tid, 2);
|
||||
merge(skeys, key, svals, val, cmp, tid, 1);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
|
||||
struct Unroll
|
||||
{
|
||||
static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
|
||||
{
|
||||
mergeShfl(key, val, cmp, I, N);
|
||||
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
|
||||
}
|
||||
static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
merge(skeys, key, svals, val, cmp, tid, I);
|
||||
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
};
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
struct Unroll<0, KP, KR, VP, VR, Cmp>
|
||||
{
|
||||
static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
|
||||
{
|
||||
}
|
||||
static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct WarpOptimized
|
||||
{
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
CV_UNUSED(skeys);
|
||||
CV_UNUSED(svals);
|
||||
CV_UNUSED(tid);
|
||||
|
||||
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
|
||||
#else
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadToSmem(svals, val, tid);
|
||||
|
||||
if (tid < N / 2)
|
||||
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct GenericOptimized32
|
||||
{
|
||||
enum { M = N / 32 };
|
||||
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
const unsigned int laneId = Warp::laneId();
|
||||
|
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
|
||||
|
||||
if (laneId == 0)
|
||||
{
|
||||
loadToSmem(skeys, key, tid / 32);
|
||||
loadToSmem(svals, val, tid / 32);
|
||||
}
|
||||
#else
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadToSmem(svals, val, tid);
|
||||
|
||||
if (laneId < 16)
|
||||
Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (laneId == 0)
|
||||
{
|
||||
loadToSmem(skeys, key, tid / 32);
|
||||
loadToSmem(svals, val, tid / 32);
|
||||
}
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
loadFromSmem(skeys, key, tid);
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
loadFromSmem(svals, val, tid);
|
||||
|
||||
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
|
||||
#else
|
||||
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool val, class T1, class T2> struct StaticIf;
|
||||
template <class T1, class T2> struct StaticIf<true, T1, T2>
|
||||
{
|
||||
typedef T1 type;
|
||||
};
|
||||
template <class T1, class T2> struct StaticIf<false, T1, T2>
|
||||
{
|
||||
typedef T2 type;
|
||||
};
|
||||
|
||||
template <unsigned int N> struct IsPowerOf2
|
||||
{
|
||||
enum { value = ((N != 0) && !(N & (N - 1))) };
|
||||
};
|
||||
|
||||
template <unsigned int N> struct Dispatcher
|
||||
{
|
||||
typedef typename StaticIf<
|
||||
(N <= 32) && IsPowerOf2<N>::value,
|
||||
WarpOptimized<N>,
|
||||
typename StaticIf<
|
||||
(N <= 1024) && IsPowerOf2<N>::value,
|
||||
GenericOptimized32<N>,
|
||||
Generic<N>
|
||||
>::type
|
||||
>::type reductor;
|
||||
};
|
||||
}
|
||||
}}}
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
|
@ -0,0 +1,392 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_TRANSFORM_DETAIL_HPP
|
||||
#define OPENCV_CUDA_TRANSFORM_DETAIL_HPP
|
||||
|
||||
#include "../common.hpp"
|
||||
#include "../vec_traits.hpp"
|
||||
#include "../functional.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace transform_detail
|
||||
{
|
||||
//! Read Write Traits
|
||||
|
||||
template <typename T, typename D, int shift> struct UnaryReadWriteTraits
|
||||
{
|
||||
typedef typename TypeVec<T, shift>::vec_type read_type;
|
||||
typedef typename TypeVec<D, shift>::vec_type write_type;
|
||||
};
|
||||
|
||||
template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
|
||||
{
|
||||
typedef typename TypeVec<T1, shift>::vec_type read_type1;
|
||||
typedef typename TypeVec<T2, shift>::vec_type read_type2;
|
||||
typedef typename TypeVec<D, shift>::vec_type write_type;
|
||||
};
|
||||
|
||||
//! Transform kernels
|
||||
|
||||
template <int shift> struct OpUnroller;
|
||||
template <> struct OpUnroller<1>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<2>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<3>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src.z);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src1.z, src2.z);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<4>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src.z);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.w = op(src.w);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src1.z, src2.z);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.w = op(src1.w, src2.w);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<8>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.a0 = op(src.a0);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.a1 = op(src.a1);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.a2 = op(src.a2);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.a3 = op(src.a3);
|
||||
if (mask(y, x_shifted + 4))
|
||||
dst.a4 = op(src.a4);
|
||||
if (mask(y, x_shifted + 5))
|
||||
dst.a5 = op(src.a5);
|
||||
if (mask(y, x_shifted + 6))
|
||||
dst.a6 = op(src.a6);
|
||||
if (mask(y, x_shifted + 7))
|
||||
dst.a7 = op(src.a7);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.a0 = op(src1.a0, src2.a0);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.a1 = op(src1.a1, src2.a1);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.a2 = op(src1.a2, src2.a2);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.a3 = op(src1.a3, src2.a3);
|
||||
if (mask(y, x_shifted + 4))
|
||||
dst.a4 = op(src1.a4, src2.a4);
|
||||
if (mask(y, x_shifted + 5))
|
||||
dst.a5 = op(src1.a5, src2.a5);
|
||||
if (mask(y, x_shifted + 6))
|
||||
dst.a6 = op(src1.a6, src2.a6);
|
||||
if (mask(y, x_shifted + 7))
|
||||
dst.a7 = op(src1.a7, src2.a7);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
|
||||
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
|
||||
|
||||
const int x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int x_shifted = x * ft::smart_shift;
|
||||
|
||||
if (y < src_.rows)
|
||||
{
|
||||
const T* src = src_.ptr(y);
|
||||
D* dst = dst_.ptr(y);
|
||||
|
||||
if (x_shifted + ft::smart_shift - 1 < src_.cols)
|
||||
{
|
||||
const read_type src_n_el = ((const read_type*)src)[x];
|
||||
OpUnroller<ft::smart_shift>::unroll(src_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
|
||||
{
|
||||
if (mask(y, real_x))
|
||||
dst[real_x] = op(src[real_x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
__global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < src.cols && y < src.rows && mask(y, x))
|
||||
{
|
||||
dst.ptr(y)[x] = op(src.ptr(y)[x]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
|
||||
const Mask mask, const BinOp op)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
|
||||
|
||||
const int x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int x_shifted = x * ft::smart_shift;
|
||||
|
||||
if (y < src1_.rows)
|
||||
{
|
||||
const T1* src1 = src1_.ptr(y);
|
||||
const T2* src2 = src2_.ptr(y);
|
||||
D* dst = dst_.ptr(y);
|
||||
|
||||
if (x_shifted + ft::smart_shift - 1 < src1_.cols)
|
||||
{
|
||||
const read_type1 src1_n_el = ((const read_type1*)src1)[x];
|
||||
const read_type2 src2_n_el = ((const read_type2*)src2)[x];
|
||||
|
||||
OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
|
||||
{
|
||||
if (mask(y, real_x))
|
||||
dst[real_x] = op(src1[real_x], src2[real_x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
|
||||
const Mask mask, const BinOp op)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < src1.cols && y < src1.rows && mask(y, x))
|
||||
{
|
||||
const T1 src1_data = src1.ptr(y)[x];
|
||||
const T2 src2_data = src2.ptr(y)[x];
|
||||
dst.ptr(y)[x] = op(src1_data, src2_data);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool UseSmart> struct TransformDispatcher;
|
||||
template<> struct TransformDispatcher<false>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
template<> struct TransformDispatcher<true>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
CV_StaticAssert(ft::smart_shift != 1, "");
|
||||
|
||||
if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
|
||||
!isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
|
||||
{
|
||||
TransformDispatcher<false>::call(src, dst, op, mask, stream);
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
CV_StaticAssert(ft::smart_shift != 1, "");
|
||||
|
||||
if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
|
||||
!isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
|
||||
!isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
|
||||
{
|
||||
TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
} // namespace transform_detail
|
||||
}}} // namespace cv { namespace cuda { namespace cudev
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_TRANSFORM_DETAIL_HPP
|
@ -0,0 +1,191 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
|
||||
#define OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
|
||||
|
||||
#include "../common.hpp"
|
||||
#include "../vec_traits.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace type_traits_detail
|
||||
{
|
||||
template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
|
||||
template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };
|
||||
|
||||
template <typename T> struct IsSignedIntergral { enum {value = 0}; };
|
||||
template <> struct IsSignedIntergral<schar> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<short> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<int> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<int1> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };
|
||||
template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };
|
||||
template <> struct IsIntegral<char> { enum {value = 1}; };
|
||||
template <> struct IsIntegral<bool> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsFloat { enum {value = 0}; };
|
||||
template <> struct IsFloat<float> { enum {value = 1}; };
|
||||
template <> struct IsFloat<double> { enum {value = 1}; };
|
||||
|
||||
// IsVec<T>::value == 1 for the CUDA/OpenCV fixed-size vector types with
// 1, 2, 3, 4 or 8 components over each arithmetic element type; 0 otherwise.
template <typename T> struct IsVec { enum {value = 0}; };
template <> struct IsVec<uchar1> { enum {value = 1}; };
template <> struct IsVec<uchar2> { enum {value = 1}; };
template <> struct IsVec<uchar3> { enum {value = 1}; };
template <> struct IsVec<uchar4> { enum {value = 1}; };
template <> struct IsVec<uchar8> { enum {value = 1}; };
template <> struct IsVec<char1> { enum {value = 1}; };
template <> struct IsVec<char2> { enum {value = 1}; };
template <> struct IsVec<char3> { enum {value = 1}; };
template <> struct IsVec<char4> { enum {value = 1}; };
template <> struct IsVec<char8> { enum {value = 1}; };
template <> struct IsVec<ushort1> { enum {value = 1}; };
template <> struct IsVec<ushort2> { enum {value = 1}; };
template <> struct IsVec<ushort3> { enum {value = 1}; };
template <> struct IsVec<ushort4> { enum {value = 1}; };
template <> struct IsVec<ushort8> { enum {value = 1}; };
template <> struct IsVec<short1> { enum {value = 1}; };
template <> struct IsVec<short2> { enum {value = 1}; };
template <> struct IsVec<short3> { enum {value = 1}; };
template <> struct IsVec<short4> { enum {value = 1}; };
template <> struct IsVec<short8> { enum {value = 1}; };
template <> struct IsVec<uint1> { enum {value = 1}; };
template <> struct IsVec<uint2> { enum {value = 1}; };
template <> struct IsVec<uint3> { enum {value = 1}; };
template <> struct IsVec<uint4> { enum {value = 1}; };
template <> struct IsVec<uint8> { enum {value = 1}; };
template <> struct IsVec<int1> { enum {value = 1}; };
template <> struct IsVec<int2> { enum {value = 1}; };
template <> struct IsVec<int3> { enum {value = 1}; };
template <> struct IsVec<int4> { enum {value = 1}; };
template <> struct IsVec<int8> { enum {value = 1}; };
template <> struct IsVec<float1> { enum {value = 1}; };
template <> struct IsVec<float2> { enum {value = 1}; };
template <> struct IsVec<float3> { enum {value = 1}; };
template <> struct IsVec<float4> { enum {value = 1}; };
template <> struct IsVec<float8> { enum {value = 1}; };
template <> struct IsVec<double1> { enum {value = 1}; };
template <> struct IsVec<double2> { enum {value = 1}; };
template <> struct IsVec<double3> { enum {value = 1}; };
template <> struct IsVec<double4> { enum {value = 1}; };
template <> struct IsVec<double8> { enum {value = 1}; };
// AddParameterType<U>::type is the preferred way to receive U as a function
// parameter: const U& in the general case, a plain reference when U is
// already a reference, and void passed through unchanged.
template <class U> struct AddParameterType
{
    typedef const U& type;
};
template <class U> struct AddParameterType<U&>
{
    typedef U& type;
};
template <> struct AddParameterType<void>
{
    typedef void type;
};
// ReferenceTraits: detects lvalue-reference types and strips the reference.
//   value - true iff U is an lvalue reference
//   type  - U with any top-level reference removed
template <class U> struct ReferenceTraits
{
    typedef U type;
    enum { value = false };
};
template <class U> struct ReferenceTraits<U&>
{
    typedef U type;
    enum { value = true };
};
// PointerTraits: detects pointer types (including pointers passed by lvalue
// reference).  'type' is the pointee for pointers, void for everything else.
template <class U> struct PointerTraits
{
    typedef void type;
    enum { value = false };
};
template <class U> struct PointerTraits<U*>
{
    typedef U type;
    enum { value = true };
};
template <class U> struct PointerTraits<U*&>
{
    typedef U type;
    enum { value = true };
};
// UnConst: strips a top-level const qualifier (also through an lvalue
// reference); 'value' reports whether a const was actually removed.
template <class U> struct UnConst
{
    enum { value = 0 };
    typedef U type;
};
template <class U> struct UnConst<const U>
{
    enum { value = 1 };
    typedef U type;
};
template <class U> struct UnConst<const U&>
{
    enum { value = 1 };
    typedef U& type;
};
// UnVolatile: strips a top-level volatile qualifier (also through an lvalue
// reference); 'value' reports whether a volatile was actually removed.
template <class U> struct UnVolatile
{
    enum { value = 0 };
    typedef U type;
};
template <class U> struct UnVolatile<volatile U>
{
    enum { value = 1 };
    typedef U type;
};
template <class U> struct UnVolatile<volatile U&>
{
    enum { value = 1 };
    typedef U& type;
};
|
||||
} // namespace type_traits_detail
|
||||
}}} // namespace cv { namespace cuda { namespace device
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
|
@ -0,0 +1,121 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
|
||||
#define OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
|
||||
|
||||
#include "../datamov_utils.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace vec_distance_detail
|
||||
{
|
||||
        // Compile-time unrolled accumulation of a distance functor over a
        // vector whose per-thread slice is cached (vecCached) while the second
        // operand stays in global memory (vecGlob).  THREAD_DIM is the stride
        // between elements handled by one thread; N counts the remaining
        // unrolled steps.
        template <int THREAD_DIM, int N> struct UnrollVecDiffCached
        {
            // Accumulate one element pair into dist, guarding against
            // ind >= len (used when len is not known to equal the unroll size).
            template <typename Dist, typename T1, typename T2>
            static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
            {
                if (ind < len)
                {
                    // Consume the next cached element; the advanced pointer is
                    // what gets passed down the recursion.
                    T1 val1 = *vecCached++;

                    T2 val2;
                    ForceGlob<T2>::Load(vecGlob, ind, val2);

                    dist.reduceIter(val1, val2);

                    // Compile-time tail recursion: each step advances the
                    // global index by THREAD_DIM.
                    UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
                }
            }

            // Same accumulation without the bounds check; here the global
            // pointer itself is advanced by THREAD_DIM each step instead of
            // carrying an index.
            template <typename Dist, typename T1, typename T2>
            static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
            {
                T1 val1 = *vecCached++;

                T2 val2;
                ForceGlob<T2>::Load(vecGlob, 0, val2);
                vecGlob += THREAD_DIM;

                dist.reduceIter(val1, val2);

                UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
            }
        };
        // Recursion terminator (N == 0): both entry points are no-ops, which
        // ends the compile-time unrolling above.
        template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
        {
            template <typename Dist, typename T1, typename T2>
            static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
            {
            }

            template <typename Dist, typename T1, typename T2>
            static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
            {
            }
        };
|
||||
        // Compile-time dispatcher selecting the checked or unchecked unrolled
        // loop, based on whether the runtime length is known to equal MAX_LEN.
        template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
        // LEN_EQ_MAX_LEN == false: len may be smaller than MAX_LEN, so every
        // element access is bounds-checked; tid is this thread's starting index.
        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
        {
            template <typename Dist, typename T1, typename T2>
            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
            {
                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
            }
        };
        // LEN_EQ_MAX_LEN == true: len is exactly MAX_LEN, so no checks are
        // needed; the thread offset is applied to the pointer up front because
        // the unchecked path advances the pointer rather than an index.
        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
        {
            template <typename Dist, typename T1, typename T2>
            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
            {
                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
            }
        };
|
||||
} // namespace vec_distance_detail
|
||||
}}} // namespace cv { namespace cuda { namespace device
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
|
@ -0,0 +1,88 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_DYNAMIC_SMEM_HPP
|
||||
#define OPENCV_CUDA_DYNAMIC_SMEM_HPP
|
||||
|
||||
/** @file
|
||||
* @deprecated Use @ref cudev instead.
|
||||
*/
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
    // Proxy exposing the kernel's dynamically sized shared memory (the
    // extern __shared__ buffer supplied at launch) as a typed pointer via
    // implicit conversion.  All instantiations alias the same buffer, so a
    // kernel using several element types must partition it manually.
    template<class T> struct DynamicSharedMem
    {
        // Implicit conversion to a mutable T* over the shared-memory buffer.
        __device__ __forceinline__ operator T*()
        {
            extern __shared__ int __smem[];
            return (T*)__smem;
        }

        // Const overload; reinterprets the same underlying buffer.
        __device__ __forceinline__ operator const T*() const
        {
            extern __shared__ int __smem[];
            return (T*)__smem;
        }
    };
    // Specialize for double to avoid unaligned-memory-access compile errors:
    // the extern __shared__ array is declared as double here so the compiler
    // assumes 8-byte alignment for the returned pointer.
    template<> struct DynamicSharedMem<double>
    {
        // Implicit conversion to a mutable double* over the shared buffer.
        __device__ __forceinline__ operator double*()
        {
            extern __shared__ double __smem_d[];
            return (double*)__smem_d;
        }

        // Const overload; same buffer.
        __device__ __forceinline__ operator const double*() const
        {
            extern __shared__ double __smem_d[];
            return (double*)__smem_d;
        }
    };
|
||||
}}}
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_DYNAMIC_SMEM_HPP
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue