Refining Interference Graph Scheduling - Deterministic Scheduling

Chapter 7 Deterministic Scheduling

7.2 Refining Interference Graph Scheduling

// Yerman Avila

// Maestría en Ingeniería - Automatización Industrial

// Golden model and comparator for hardware verification using
// C++ and CUDA C
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Defined after the includes so the single-letter macro N cannot rewrite
// identifiers inside the system headers.
#define N (2048*2048)
//#define N 512
#define THREADS_PER_BLOCK 512

// Vector-addition kernel (runs on the GPU): c[i] = a[i] + b[i].
// Each thread handles at most one element, selected by its flat 1-D global
// index; threads whose index is >= n do nothing.
__global__ void add_gpu(int *a, int *b, int *c, int n)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: the launch may contain more threads than elements.
    if (gid >= n) {
        return;
    }

    c[gid] = a[gid] + b[gid];
}

// Vector-comparison kernel (runs on the GPU): c[i] = a[i] - b[i], so an
// output element is 0 exactly where the two inputs agree.
// Each thread handles at most one element, selected by its flat 1-D global
// index; threads whose index is >= n do nothing.
__global__ void compare_gpu(int *a, int *b, int *c, int n)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: the launch may contain more threads than elements.
    if (gid >= n) {
        return;
    }

    c[gid] = a[gid] - b[gid];
}

// Host-side golden model: element-wise sum of two vectors,
// c[i] = a[i] + b[i] for every i in [0, n).
//
//   a, b : input vectors, each holding at least n elements
//   c    : output vector with room for at least n elements
//   n    : number of elements to process; n <= 0 leaves c untouched
void add_cpu(int *a, int *b, int *c, int n)
{
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

// Host-side comparator: checks two vectors element by element.
//
// Prints one line for every index where a[i] != b[i], followed by a single
// pass/fail summary line.
//
//   a, b : vectors to compare, each holding at least n elements
//   n    : number of elements to compare
//
// Returns 0 when the first n elements are all equal, 1 otherwise.
int compare_ints(int *a, int *b, int n)
{
    int pass = 0;   // 0 = no mismatch seen so far, 1 = at least one mismatch

    // Bounded by the caller-supplied n (not the global N) so the function
    // never reads past the buffers it was handed.
    for (int i = 0; i < n; i++) {
        if (a[i] != b[i]) {
            printf("Valor diferente en %d, valor_1: %d valor_2: %d\n", i, a[i], b[i]);
            pass = 1;
        }
    }

    // The summary is printed once, after the whole range has been scanned.
    if (pass == 0)
        printf("Test CPU Correcto!\n");
    else
        printf("El test de comparación CPU ha fallado.\n");

    return pass;
}

// Host-side check of a difference vector produced by compare_gpu: the
// comparison passes only if every element is 0.
//
// Prints one line for every non-zero element, followed by a single
// pass/fail summary line.
//
//   a : difference vector holding at least n elements
//   n : number of elements to inspect
//
// Returns 0 when the first n elements are all zero, 1 otherwise.
int validate(int *a, int n)
{
    int pass = 0;   // 0 = all zero so far, 1 = at least one non-zero element

    // Bounded by the caller-supplied n (not the global N) so the function
    // never reads past the buffer it was handed.
    for (int i = 0; i < n; i++) {
        if (a[i] != 0) {
            printf("Valor diferente de 0 en i= %d, valor_1= %d \n", i, a[i]);
            pass = 1;
        }
    }

    // The summary is printed once, after the whole range has been scanned.
    if (pass == 0)
        printf("Test GPU Correcto!\n");
    else
        printf("El test de comparación GPU ha fallado.\n");

    return pass;
}

/* files:
 *   f1 = stimuli.txt
 *   f2 = output_adderVerilog.txt
 */
FILE *f1, *f2;

// Aborts with a diagnostic when a CUDA runtime call does not return
// cudaSuccess, so a failure is reported where it happens instead of
// surfacing later as wrong comparison results.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_check_err_ = (call);                               \
        if (cuda_check_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cuda_check_err_));                   \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Allocates `bytes` of host memory for an int buffer; exits on failure.
static int *alloc_host_ints(size_t bytes)
{
    int *buf = (int *)malloc(bytes);
    if (buf == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return buf;
}

// Opens `path` for reading; exits with a diagnostic if it cannot be opened.
static FILE *open_input_file(const char *path)
{
    FILE *fp = fopen(path, "r");
    if (fp == NULL) {
        perror(path);
        exit(EXIT_FAILURE);
    }
    return fp;
}

// Processor time, in seconds, elapsed since `t0` (as measured by clock()).
static double seconds_since(clock_t t0)
{
    return ((double)clock() - t0) / CLOCKS_PER_SEC;
}

// Verifies the output of a Verilog adder (the DUT) against a golden model.
//
// Reads N operand pairs from stimuli.txt and N DUT results from
// output_adderVerilog.txt, then runs and times four verification flows:
//   1. golden model on the CPU, comparison on the CPU
//      (this timing also covers reading both input files);
//   2. golden model on the GPU, comparison on the CPU;
//   3. golden model on the CPU, comparison on the GPU;
//   4. golden model on the GPU, comparison on the GPU.
//
// Usage: prog [cuda_device_index]
//   The optional argument selects the CUDA device; device 0 is used when it
//   is omitted. (The argument count itself is not a device index.)
//
// Returns 0 when all flows ran to completion, regardless of whether the
// comparisons passed (their result is reported on stdout). Exits with
// EXIT_FAILURE on a bad argument, allocation failure, unreadable or
// truncated input file, or any CUDA error.
int main(int argc, char *argv[])
{
    int device = 0;
    if (argc > 1) {
        char *end = NULL;
        long requested = strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0' || requested < 0) {
            fprintf(stderr, "usage: %s [cuda_device_index]\n", argv[0]);
            return EXIT_FAILURE;
        }
        device = (int)requested;
    }
    CUDA_CHECK(cudaSetDevice(device));

    clock_t start = clock();

    // Size in bytes of every vector (host and device).
    const size_t size = (size_t)N * sizeof(int);

    // Ceiling division so a tail of fewer than THREADS_PER_BLOCK elements
    // still gets a block; the kernels guard against the surplus threads.
    const int blocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    // Host buffers.
    int *a         = alloc_host_ints(size);   // first operand
    int *b         = alloc_host_ints(size);   // second operand
    int *c_verilog = alloc_host_ints(size);   // DUT output read from file
    int *c_h       = alloc_host_ints(size);   // golden model computed on the CPU
    int *c_gpu     = alloc_host_ints(size);   // golden model computed on the GPU
    int *comp      = alloc_host_ints(size);   // GPU comparison: DUT vs CPU golden model
    int *comp2     = alloc_host_ints(size);   // GPU comparison: DUT vs GPU golden model

    // Device buffers.
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_cgpu = NULL;
    int *dev_ch = NULL;
    int *dev_cver = NULL;
    int *dev_compare = NULL;
    int *dev_compare2 = NULL;

    CUDA_CHECK(cudaMalloc((void **)&dev_a, size));
    CUDA_CHECK(cudaMalloc((void **)&dev_b, size));
    CUDA_CHECK(cudaMalloc((void **)&dev_cgpu, size));
    CUDA_CHECK(cudaMalloc((void **)&dev_ch, size));
    CUDA_CHECK(cudaMalloc((void **)&dev_cver, size));
    CUDA_CHECK(cudaMalloc((void **)&dev_compare, size));
    CUDA_CHECK(cudaMalloc((void **)&dev_compare2, size));

    // ---- Flow 1: golden model on the CPU, comparison on the CPU ----------
    clock_t start_ver_host = clock();

    // Read the input vectors a[i] and b[i] from stimuli.txt.
    f1 = open_input_file("stimuli.txt");
    for (int i = 0; i < N; i++) {
        if (fscanf(f1, "%d %d\n", &a[i], &b[i]) != 2) {
            fprintf(stderr, "stimuli.txt: could not read operand pair %d\n", i);
            return EXIT_FAILURE;
        }
    }
    fclose(f1);

    // Read the DUT output vector c_verilog[i] from output_adderVerilog.txt.
    f2 = open_input_file("output_adderVerilog.txt");
    for (int i = 0; i < N; i++) {
        if (fscanf(f2, "%d\n", &c_verilog[i]) != 1) {
            fprintf(stderr, "output_adderVerilog.txt: could not read value %d\n", i);
            return EXIT_FAILURE;
        }
    }
    fclose(f2);

    add_cpu(a, b, c_h, N);
    compare_ints(c_verilog, c_h, N);

    printf("\n Tiempo total transcurrido verificación del DUT con GM en C: %f \n",
           seconds_since(start_ver_host));

    // ---- Flow 2: golden model on the GPU, comparison on the CPU ----------
    clock_t start_ver_GPU = clock();

    CUDA_CHECK(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice));

    add_gpu<<<blocks, THREADS_PER_BLOCK>>>(dev_a, dev_b, dev_cgpu, N);
    CUDA_CHECK(cudaGetLastError());   // launch-configuration errors

    // Blocking copy: also waits for the kernel and reports execution errors.
    CUDA_CHECK(cudaMemcpy(c_gpu, dev_cgpu, size, cudaMemcpyDeviceToHost));

    compare_ints(c_verilog, c_gpu, N);

    printf("\n Tiempo total transcurrido verificación del DUT con GM en CUDA: %f \n",
           seconds_since(start_ver_GPU));

    // ---- Flow 3: golden model on the CPU, comparison on the GPU ----------
    clock_t start_ver_cuda = clock();

    CUDA_CHECK(cudaMemcpy(dev_ch, c_h, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_cver, c_verilog, size, cudaMemcpyHostToDevice));
    // Kept from the original flow so the timing stays comparable; c_gpu
    // already holds this data from flow 2.
    CUDA_CHECK(cudaMemcpy(c_gpu, dev_cgpu, size, cudaMemcpyDeviceToHost));

    compare_gpu<<<blocks, THREADS_PER_BLOCK>>>(dev_cver, dev_ch, dev_compare, N);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(comp, dev_compare, size, cudaMemcpyDeviceToHost));

    validate(comp, N);

    printf("\n Tiempo total transcurrido verificación del DUT con GM en C y comp CUDA: %f \n",
           seconds_since(start_ver_cuda));

    // ---- Flow 4: golden model on the GPU, comparison on the GPU ----------
    clock_t start_GMCuda2_ver = clock();

    // The copies below are kept from the original flow so the timing stays
    // comparable; the kernel launched here only reads dev_cver and dev_cgpu.
    CUDA_CHECK(cudaMemcpy(dev_ch, c_h, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_cver, c_verilog, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(c_gpu, dev_cgpu, size, cudaMemcpyDeviceToHost));

    compare_gpu<<<blocks, THREADS_PER_BLOCK>>>(dev_cver, dev_cgpu, dev_compare2, N);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(comp2, dev_compare2, size, cudaMemcpyDeviceToHost));

    validate(comp2, N);

    printf("\n Tiempo total transcurrido verificación del DUT con GM en CUDA y comp CUDA: %f \n",
           seconds_since(start_GMCuda2_ver));

    // Release host memory.
    free(a);
    free(b);
    free(c_verilog);
    free(c_h);
    free(c_gpu);
    free(comp);
    free(comp2);

    // Release device memory.
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_cgpu));
    CUDA_CHECK(cudaFree(dev_ch));
    CUDA_CHECK(cudaFree(dev_cver));
    CUDA_CHECK(cudaFree(dev_compare));
    CUDA_CHECK(cudaFree(dev_compare2));

    printf("\n Tiempo total verificación transcurrido: %f \n", seconds_since(start));

    printf("\n Cantidad de datos tipo entero en cada vector %d \n", N);

    return 0;
}

Bibliografía

[1] J. L. Hennessy and D. A. Patterson, Computer Architecture: A quantitative approach, 5th ed. Waltham, MA, 2012.

[2] D. A. Patterson and J. L. Hennessy, Computer Organization and Design: The hardware/software interface, 4th ed. Waltham, MA: MK, 2012.

[3] J. Nakano, “Parallel computing techniques,” 2004.

[4] Z. Chen, D. Kaeli, and N. Rubin, “Characterizing scalar opportunities in GPGPU

applications,” ISPASS 2013 - IEEE Int. Symp. Perform. Anal. Syst. Softw., pp. 225–234, 2013.

[5] D. A. Yuen, L. Wang, X. Chi, L. Johnsson, W. Ge, and Y. Shi, GPU Solutions to multi-scale

problems in Science and Engineering. 2013.

[6] T. S. Crow, “Evolution of the Graphical Processing Unit,” no. December 2004.

[7] S. Al-Kiswany, A. Gharaibeh, and M. Ripeanu, “GPUs as storage system accelerators,”

IEEE Trans. Parallel Distrib. Syst., vol. 24, no. 8, pp. 1556–1566, 2013.

[8] R. M. Amorim and R. Weber dos Santos, “Solving the cardiac bidomain equations using graphics processing units,” J. Comput. Sci., vol. 4, no. 5, pp. 370–376, 2013.

[9] T. S. Crow, “Evolution of the graphical processing unit,” no. December 2004, 2004.

[10] A. S. Arefin, C. Riveros, R. Berretta, and P. Moscato, “GPU-FS-kNN: a software tool for fast and scalable kNN computation using GPUs.,” PLoS One, vol. 7, no. 8, p. e44000, Jan. 2012.

[11] D. Aracena-Pizarro and N. Daneri-Alvarado, “Detección de puntos claves mediante SIFT paralelizado en GPU,” Ingeniare. Rev. Chil. Ing., vol. 21, no. 3, pp. 438–447, 2013.

[12] D. Defour and M. Marin, “FuzzyGPU: A Fuzzy Arithmetic Library for GPU,” 2014 22nd Euromicro Int. Conf. Parallel, Distrib. Network-Based Process., pp. 624–631, Feb. 2014.

[13] V. Beddo, “Applications of Parallel Programming in Statistics,” University of California, Los Angeles, 2002.

[14] P. Benner, P. Ezzatti, H. Mena, E. Quintana-Ortí, and A. Remón, “Solving Matrix Equations on Multi-Core and Many-Core Architectures,” Algorithms, vol. 6, no. 4, pp. 857–870, 2013.

[15] Y. Zhang, Y. H. Shalabi, R. Jain, K. K. Nagar, and J. D. Bakos, “FPGA vs. GPU for sparse matrix vector multiply,” Proc. 2009 Int. Conf. Field-Programmable Technol. FPT’09, pp. 255–262, 2009.

[16] T. Cheng, “Accelerating universal Kriging interpolation algorithm using CUDA-enabled GPU,” Comput. Geosci., vol. 54, pp. 178–183, 2013.

[17] M. Błażewicz, K. Kurowski, B. Ludwiczak, K. Napierała, T. E. Simos, G. Psihoyios, and C. Tsitouras, “Problems Related to Parallelization of CFD Algorithms on GPU, Multi-GPU and Hybrid Architectures,” pp. 1301–1304, 2010.

[18] Y. Zhao, Q. Qiu, J. Fang, and L. Li, “Fast parallel interpolation algorithm using cuda,” pp. 3662–3665, 2013.

[19] M. N. Velev, “Efficient Parallel GPU Algorithms for BDD Manipulation *,” pp. 750–755, 2014.

[20] T. T. Zygiridis, “High-Order Error-Optimized FDTD Algorithm With GPU Implementation,”

IEEE Trans. Magn., vol. 49, no. 5, pp. 1809–1812, 2013.

[21] G. Beliakov, M. Johnstone, D. Creighton, and T. Wilkin, “An efficient implementation of Bailey and Borwein’s algorithm for parallel random number generation on graphics processing units,” Computing, vol. 95, no. 4, pp. 309–326, 2012.

[22] S. Ghetia, N. Gajjar, and R. Gajjar, “Implementation of 2-D Discrete Cosine Transform Algorithm on GPU,” vol. 2, no. 7, pp. 3024–3030, 2013.

[23] O. Maitre, N. Lachiche, P. Clauss, L. Baumes, A. Corma, and P. Collet, “Efficient parallel implementation of evolutionary algorithms on GPGPU cards,” Lect. Notes Comput. Sci.

(including Subser. Lect. Notes Artif. Intell. Lect. Notes Bioinformatics), vol. 5704 LNCS, pp.

974–985, 2009.

[24] M. Bailey, “Using GPU shaders for visualization, Part 3,” IEEE Comput. Graph. Appl., vol. 33, no. 3, pp. 5–11, 2013.

[25] D. Weiskopf, GPU-Based Interactive Visualization Techniques. Berlin: Springer, 2007.

[26] V. Galiano, O. López, M. P. Malumbres, and H. Migallón, “Parallel strategies for 2D Discrete Wavelet Transform in shared memory systems and GPUs,” J. Supercomput., vol. 64, no. 1, pp. 4–16, 2012.

[27] M. Arora, “The Architecture and Evolution of CPU-GPU Systems for General Purpose Computing.”

[28] E. Kandrot and J. Sanders, CUDA by Example, vol. 21. 2011.

[29] W. Nvidia, N. Generation, and C. Compute, “Whitepaper NVIDIA’s Next Generation CUDA Compute Architecture,” ReVision, vol. 23, no. 6, pp. 1–22, 2009.

[30] J. Nakano, “Handbook of Computational Statistics,” pp. 243–271, 2012.

[31] Y. Deng, Applied Parallel Computing, vol. XXXIII, no. 2. Singapore: World Scientific Publishing Co. Pte. Ltd., 2013.

[32] DakarTeam, “SHOC : The Scalable HeterOgeneous Computing Benchmark Suite,”

Building, no. November, pp. 1–8, 2011.

[33] D. Culler and J. P. Singh, “Parallel Computer Architecture,” 1997.

[34] J. Nakano, “Parallel Computing Techniques,” pp. 243–271, 2012.

[35] J. Nickolls and W. J. Dally, “The GPU Computing Era,” IEEE Micro, pp. 56–70, 2010.

[36] OfficeOfScienceAndTechnologyPolicy, “A Research and Development Strategy for High Performance Computing,” Sci. Technol., 1987.

[37] C.-Y. Chou, Y. Dong, Y. Hung, Y.-J. Kao, W. Wang, C.-M. Kao, and C.-T. Chen, “Accelerating image reconstruction in dual-head PET system by GPU and symmetry properties.,” PLoS One, vol. 7, no. 12, p. e50540, Jan. 2012.

[38] NVIDIA, “Cuda Education & Training,” 2015. [Online]. Available: https://developer.nvidia.com/cuda-education-training.

[39] C. Mcclanahan, “History and Evolution of GPU Architecture,” pp. 1–7, 2010.

[40] J. J. Durillo, “Programming GPUs Lecture 1 GPU Programs and Introduction to OpenCL ( I ) Section 1 Executing Programs in GPU.”

[41] D. Luebke and G. Humphreys, “How GPUs Work,” 2007.

[42] B. T. Phong, “Illumination for Computer-Generated Images,” Commun. ACM, vol. 18, no. 6, pp. 311–317, 1975.

[43] Nvidia, “GeForce 8800 GPU Architecture - Technical Brief,” Architecture, no. November, 2006.

[44] NVIDIA, “Kepler GK110 Overview,” Overview, 2014.

[45] F. F. Maxwell and G. P. U. Technology, “NVIDIA GeForce GTX 750 Ti,” pp. 1–11, 2014.

[46] NVIDIA, “CUDA C Programming Guide,” no. February, p. 240, 2015.

[47] C. Cuda, “Best Practices Guide,” Nvidia Corp., no. DG-05603–001_v6.0, 2014.

[48] NVIDIA, “CUDA ZONE,” 2015. [Online]. Available: https://developer.nvidia.com/cuda-zone.

[49] AMD, “Developer AMD,” 2015.

[50] Nvidia, “NVIDIA Tesla C2075 Companion Processor,” Prod. Br., 2011.

[51] NVIDIA, “Quadro 600.” [Online]. Available: http://la.nvidia.com/object/product-quadro-600-la.html.

[52] NVIDIA, “GeForce GT 520M.” [Online]. Available: http://la.nvidia.com/object/product-geforce-gt-520m-la.html.

[53] Nvidia_CUDA_Toolkit, “Nvidia cuda toolkit v6.5,” no. November, 2013.

[54] D. C. Black, J. Donovan, B. Bunton, and A. Keist, SystemC: From the Ground Up. New York: Springer, 2010.

[55] Y. Avila, “Uso de herramientas libres para la Verificación funcional por simulación de sistemas digitales,” Universidad Nacional de Colombia, 2012.

In document Galois : a system for parallel execution of irregular algorithms (Page 145-149)