sppu-practical
Abhijeet Gavali
Posted on April 23, 2024
CUDA matrix multiplication
#include <cuda_runtime.h>
#include <iostream>
// Multiplies two square N x N integer matrices: C = A * B.
// All three matrices are indexed row-major (element (r, c) lives at r*N + c).
// Launch layout: a 2D grid of 2D blocks in which x spans columns and y spans
// rows; each thread produces exactly one element of C, and threads that fall
// outside the N x N matrix do nothing.
__global__ void matmul(int* A, int* B, int* C, int N) {
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Guard the grid tail: the launch may cover more threads than elements.
    if (row >= N || col >= N) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    int acc = 0;
    const int rowBase = row * N;
    for (int k = 0; k < N; ++k) {
        acc += A[rowBase + k] * B[k * N + col];
    }
    C[rowBase + col] = acc;
}
// Evaluates a CUDA runtime call; on failure prints the error text and makes
// main() return 1. Only usable inside main() because it expands to `return`.
#define CUDA_TRY(call)                                                        \
    do {                                                                      \
        cudaError_t status_ = (call);                                         \
        if (status_ != cudaSuccess) {                                         \
            std::cerr << "CUDA error at line " << __LINE__ << ": "            \
                      << cudaGetErrorString(status_) << std::endl;            \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Host driver: builds two N x N integer matrices, multiplies them on the GPU
// with the matmul kernel and prints the top-left corner of the product.
// Returns 0 on success and 1 if any CUDA call fails (buffers are then left
// for the operating system to reclaim at process exit).
int main() {
    const int N = 512;
    const size_t size = static_cast<size_t>(N) * N * sizeof(int);

    int *A = nullptr, *B = nullptr, *C = nullptr;
    int *dev_A = nullptr, *dev_B = nullptr, *dev_C = nullptr;

    // Host buffers allocated through the CUDA runtime, device buffers on the GPU.
    CUDA_TRY(cudaMallocHost(&A, size));
    CUDA_TRY(cudaMallocHost(&B, size));
    CUDA_TRY(cudaMallocHost(&C, size));
    CUDA_TRY(cudaMalloc(&dev_A, size));
    CUDA_TRY(cudaMalloc(&dev_B, size));
    CUDA_TRY(cudaMalloc(&dev_C, size));

    // Initialize matrices A and B (B is the transpose pattern of A).
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i * N + j] = i * N + j;
            B[i * N + j] = j * N + i;
        }
    }

    CUDA_TRY(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice));

    // Ceil-divide so the grid still covers the whole matrix when N is not a
    // multiple of the block edge; the kernel's bounds check drops the excess.
    dim3 dimBlock(16, 16);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);
    matmul<<<dimGrid, dimBlock>>>(dev_A, dev_B, dev_C, N);
    // A kernel launch returns nothing; configuration errors surface here.
    CUDA_TRY(cudaGetLastError());

    // Blocking copy: also reports any fault raised while the kernel ran.
    CUDA_TRY(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost));

    // Print the result (at most the top-left 10 x 10 corner).
    const int shown = N < 10 ? N : 10;
    for (int i = 0; i < shown; i++) {
        for (int j = 0; j < shown; j++) {
            std::cout << C[i * N + j] << " ";
        }
        std::cout << std::endl;
    }

    // Free memory
    CUDA_TRY(cudaFree(dev_A));
    CUDA_TRY(cudaFree(dev_B));
    CUDA_TRY(cudaFree(dev_C));
    CUDA_TRY(cudaFreeHost(A));
    CUDA_TRY(cudaFreeHost(B));
    CUDA_TRY(cudaFreeHost(C));
    return 0;
}
#undef CUDA_TRY
CUDA vector addition
// program 2
#include <iostream>
#include <cuda_runtime.h>
using namespace std;
// Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < n.
// Launch layout: 1D grid of 1D blocks with at least n threads in total; each
// thread handles one element and threads with index >= n do nothing.
__global__ void addVectors(int* A, int* B, int* C, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        // The index into B must be the lowercase thread index `i`;
        // the previous capital `I` was an undeclared identifier.
        C[i] = A[i] + B[i];
    }
}
// Evaluates a CUDA runtime call; on failure prints the error text and makes
// main() return 1. Only usable inside main() because it expands to `return`.
#define CUDA_TRY(call)                                                        \
    do {                                                                      \
        cudaError_t status_ = (call);                                         \
        if (status_ != cudaSuccess) {                                         \
            std::cerr << "CUDA error at line " << __LINE__ << ": "            \
                      << cudaGetErrorString(status_) << std::endl;            \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Host driver: fills A[i] = i and B[i] = 2*i, adds them on the GPU with the
// addVectors kernel and prints the first few sums.
// Returns 0 on success and 1 if any CUDA call fails (buffers are then left
// for the operating system to reclaim at process exit).
int main()
{
    const int n = 1000000;
    const size_t size = static_cast<size_t>(n) * sizeof(int);
    int *A = nullptr, *B = nullptr, *C = nullptr;

    // Allocate memory on the host
    CUDA_TRY(cudaMallocHost(&A, size));
    CUDA_TRY(cudaMallocHost(&B, size));
    CUDA_TRY(cudaMallocHost(&C, size));

    // Initialize the vectors
    for (int i = 0; i < n; i++)
    {
        A[i] = i;  // was `I`, an undeclared identifier
        B[i] = i * 2;
    }

    // Allocate memory on the device
    int *dev_A = nullptr, *dev_B = nullptr, *dev_C = nullptr;
    CUDA_TRY(cudaMalloc(&dev_A, size));
    CUDA_TRY(cudaMalloc(&dev_B, size));
    CUDA_TRY(cudaMalloc(&dev_C, size));

    // Copy data from host to device
    CUDA_TRY(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice));

    // Launch the kernel; ceil-division guarantees at least n threads.
    const int blockSize = 256;
    const int numBlocks = (n + blockSize - 1) / blockSize;
    addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);
    // A kernel launch returns nothing; configuration errors surface here.
    CUDA_TRY(cudaGetLastError());

    // Copy data from device to host (blocking, so it also reports kernel faults)
    CUDA_TRY(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost));

    // Print the results (at most the first 10 elements)
    const int shown = n < 10 ? n : 10;
    for (int i = 0; i < shown; i++)
    {
        std::cout << C[i] << " ";
    }
    std::cout << std::endl;

    // Free memory
    CUDA_TRY(cudaFree(dev_A));
    CUDA_TRY(cudaFree(dev_B));
    CUDA_TRY(cudaFree(dev_C));
    CUDA_TRY(cudaFreeHost(A));
    CUDA_TRY(cudaFreeHost(B));
    CUDA_TRY(cudaFreeHost(C));
    return 0;
}
#undef CUDA_TRY
C++ parallel BFS and DFS (OpenMP)
#include <iostream>
#include <vector>
#include <queue>
#include <stack>
#include <omp.h>
using namespace std;
// Undirected graph over vertices 0 .. V-1, stored as one adjacency list per
// vertex. Vertex indices are not range-checked.
class Graph {
    int V;                        // vertex count given at construction
    vector<vector<int>> adjList;  // adjList[v] holds the neighbours of v

public:
    // Creates a graph with `vertexCount` vertices and no edges.
    Graph(int vertexCount) : V(vertexCount), adjList(vertexCount) {}

    // Adds the undirected edge src <-> dest by recording each endpoint in the
    // other's adjacency list.
    void addEdge(int src, int dest) {
        vector<int>& fromSrc = adjList[src];
        fromSrc.push_back(dest);
        vector<int>& fromDest = adjList[dest];
        fromDest.push_back(src);
    }

    // Returns a copy of the neighbours of `vertex`, in insertion order.
    vector<int> getNeighbors(int vertex) {
        const vector<int>& stored = adjList[vertex];
        return vector<int>(stored.begin(), stored.end());
    }
};
// Breadth-first traversal from `source`, printing "Visited: <v>" for every
// vertex reached. `visited` must have one entry per vertex; entries that are
// already true are treated as explored and skipped.
// The neighbours of each dequeued vertex are examined by an OpenMP parallel
// loop. Vertices are still printed level by level, but the order in which
// siblings are enqueued may differ between runs.
void parallelBFS(Graph& graph, int source, vector<bool>& visited) {
    // Reject a start vertex that `visited` cannot index.
    if (source < 0 || source >= static_cast<int>(visited.size())) {
        return;
    }
    queue<int> q;
    q.push(source);
    visited[source] = true;
    while (!q.empty()) {
        int current = q.front();
        q.pop();
        cout << "Visited: " << current << endl;
        const vector<int> neighbors = graph.getNeighbors(current);
        const int count = static_cast<int>(neighbors.size());
        #pragma omp parallel for
        for (int i = 0; i < count; ++i) {
            const int neighbor = neighbors[i];
            // std::queue is not thread-safe and vector<bool> packs several
            // flags into one word, so the test-and-set plus the push must be
            // executed by one thread at a time.
            #pragma omp critical(bfs_frontier)
            {
                if (!visited[neighbor]) {
                    visited[neighbor] = true;
                    q.push(neighbor);
                }
            }
        }
    }
}
// Stack-based traversal from `source`, printing "Visited: <v>" for every
// vertex reached. `visited` must have one entry per vertex; entries that are
// already true are treated as explored and skipped. Vertices are marked when
// they are pushed, so each vertex is pushed at most once.
// The neighbours of each popped vertex are examined by an OpenMP parallel
// loop, so the order in which they land on the stack (and therefore the
// visiting order) may differ between runs.
void parallelDFS(Graph& graph, int source, vector<bool>& visited) {
    // Reject a start vertex that `visited` cannot index.
    if (source < 0 || source >= static_cast<int>(visited.size())) {
        return;
    }
    stack<int> s;
    s.push(source);
    visited[source] = true;
    while (!s.empty()) {
        int current = s.top();
        s.pop();
        cout << "Visited: " << current << endl;
        const vector<int> neighbors = graph.getNeighbors(current);
        const int count = static_cast<int>(neighbors.size());
        #pragma omp parallel for
        for (int i = 0; i < count; ++i) {
            const int neighbor = neighbors[i];
            // std::stack is not thread-safe and vector<bool> packs several
            // flags into one word, so the test-and-set plus the push must be
            // executed by one thread at a time.
            #pragma omp critical(dfs_frontier)
            {
                if (!visited[neighbor]) {
                    visited[neighbor] = true;
                    s.push(neighbor);
                }
            }
        }
    }
}
// Reads an undirected graph from standard input (vertex count, edge count,
// then one "src dest" pair per edge) and runs parallelBFS followed by
// parallelDFS from vertex 0.
// Returns 0 on success and 1 when the input is malformed or refers to a
// vertex outside [0, V-1].
int main() {
    int V = 0, E = 0;
    cout << "Enter the number of vertices: ";
    // Both traversals start at vertex 0, so at least one vertex is required.
    if (!(cin >> V) || V <= 0) {
        cerr << "The number of vertices must be a positive integer." << endl;
        return 1;
    }
    Graph graph(V);
    cout << "Enter the number of edges: ";
    if (!(cin >> E) || E < 0) {
        cerr << "The number of edges must be a non-negative integer." << endl;
        return 1;
    }
    cout << "Enter the edges (src dest):" << endl;
    for (int i = 0; i < E; ++i) {
        int src = 0, dest = 0;
        // Graph::addEdge indexes its adjacency lists without range checks,
        // so out-of-range endpoints must be rejected here.
        if (!(cin >> src >> dest) || src < 0 || src >= V || dest < 0 || dest >= V) {
            cerr << "Invalid edge " << i << ": endpoints must be integers in [0, "
                 << V - 1 << "]." << endl;
            return 1;
        }
        graph.addEdge(src, dest);
    }
    vector<bool> visited(V, false);
    cout << "Parallel BFS:" << endl;
    // `single` lets exactly one thread of the team drive the traversal.
    #pragma omp parallel num_threads(2)
    {
        #pragma omp single nowait
        parallelBFS(graph, 0, visited);
    }
    // Reset visited array for DFS (assign avoids relying on <algorithm>,
    // which this program does not include).
    visited.assign(V, false);
    cout << "Parallel DFS:" << endl;
    #pragma omp parallel num_threads(2)
    {
        #pragma omp single nowait
        parallelDFS(graph, 0, visited);
    }
    return 0;
}
Boston housing price regression (Keras)
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten
from sklearn import preprocessing
# Load the Boston housing dataset that ships with Keras.
(X_train, Y_train), (X_test, Y_test) = keras.datasets.boston_housing.load_data()

# Report the shape of every split.
for caption, split in (
    ("Training data shape:", X_train),
    ("Test data shape:", X_test),
    ("Train output data shape:", Y_train),
    ("Actual Test output data shape:", Y_test),
):
    print(caption, split.shape)

# Normalize the data.
# NOTE(review): with default arguments sklearn's `normalize` rescales each
# sample (row), not each feature column — confirm that is the intent.
X_train = preprocessing.normalize(X_train)
X_test = preprocessing.normalize(X_test)

# Model building: three ReLU hidden layers and one linear output unit.
feature_shape = X_train[0].shape
model = Sequential([
    Dense(128, activation='relu', input_shape=feature_shape),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),
])
model.summary()

# Train with mean-squared-error loss and track mean absolute error.
# The test split doubles as validation data during fitting.
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=100,
    batch_size=1,
    verbose=1,
)

# Final [loss, mae] on the test split.
results = model.evaluate(X_test, Y_test)
print(results)
Fashion-MNIST image classification (Keras)
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
# Load the Fashion-MNIST dataset that ships with Keras.
fashion_mnist = keras.datasets.fashion_mnist
(train_img, train_labels), (test_img, test_labels) = fashion_mnist.load_data()

# Divide pixel values by 255.0 (presumably mapping 8-bit intensities into
# [0, 1] — confirm against the dataset documentation).
train_img = train_img / 255.0
test_img = test_img / 255.0

# Flatten each 28x28 image, then one hidden ReLU layer and a 10-way softmax.
model = keras.Sequential([keras.layers.Flatten(input_shape=(28, 28)),
                          keras.layers.Dense(128, activation='relu'),
                          keras.layers.Dense(10, activation='softmax')])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(train_img, train_labels, epochs=10)

test_loss, test_acc = model.evaluate(test_img, test_labels)
print("accuracy of tessting: ",test_acc)

# Per-class scores for every test image and the index of the highest score.
predictions = model.predict(test_img)
predicted_labels = np.argmax(predictions, axis=1)

# Show the first num_rows*num_cols test images, each next to a bar chart of
# its 10 class scores (two subplot cells per image).
num_rows = 5
num_cols = 5
num_imgs = num_rows*num_cols
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_imgs):
    # Left cell: the i-th test image. The index must be `i`; a constant
    # index here drew the same picture in every cell.
    plt.subplot(num_rows, 2*num_cols, 2*i+1)
    plt.imshow(test_img[i], cmap='gray')
    plt.axis("off")
    # Right cell: the class scores for that image, titled with the prediction.
    plt.subplot(num_rows, 2*num_cols, 2*i+2)
    plt.bar(range(10), predictions[i])
    plt.xticks(range(10))
    plt.ylim([0,1])
    plt.title(f"predicted_labels: {predicted_labels[i]}")
# Lay the figure out once, after every subplot exists.
plt.tight_layout()
plt.show()
IMDB review sentiment classification (Keras)
from keras.datasets import imdb
# Load the IMDB reviews, passing num_words=10000 to the Keras loader.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# Largest word index found in the training reviews (the value is not stored).
max(max(sequence) for sequence in train_data)

# Invert the word -> index table so that indices can be mapped back to words.
word_index = imdb.get_word_index()
reverse_word_index = {index: word for word, index in word_index.items()}

# Decode the first training review; indices without a word become '?'.
# NOTE(review): the -3 offset presumably skips indices reserved by the
# loader — confirm against the Keras imdb.load_data documentation.
decoded_review = ' '.join(
    reverse_word_index.get(token - 3, '?') for token in train_data[0]
)
import numpy as np
def vectorize(sequences, dimension=10000):
    """Multi-hot encode a collection of integer index sequences.

    Args:
        sequences: sized iterable of integer index sequences; every index
            must be smaller than ``dimension``.
        dimension: width of each encoded row.

    Returns:
        A ``(len(sequences), dimension)`` array of zeros in which row ``r``
        holds 1 at every index listed in ``sequences[r]``.
    """
    encoded = np.zeros((len(sequences), dimension))
    # Each `row` is a view into `encoded`, so assigning through it fills
    # the result in place.
    for row, indices in zip(encoded, sequences):
        row[indices] = 1
    return encoded
from keras import layers, models

# Multi-hot encode the reviews and convert the labels to float32 arrays.
x_train, x_test = vectorize(train_data), vectorize(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

# Two 16-unit ReLU layers and a single sigmoid output unit.
model = models.Sequential()
for layer in (
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
):
    model.add(layer)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Hold out the first 10000 training samples for validation.
holdout = 10000
x_val, partial_x = x_train[:holdout], x_train[holdout:]
y_val, partial_y = y_train[:holdout], y_train[holdout:]

history = model.fit(
    partial_x,
    partial_y,
    validation_data=(x_val, y_val),
    epochs=20,
    batch_size=512,
)

# Final [loss, accuracy] on the test split.
results = model.evaluate(x_test, y_test)
print(results)
C++ min, max, sum and average reductions (OpenMP)
#include <iostream>
#include <vector>
#include <omp.h>
#include <climits>
using namespace std;
// Prints "Minimum value: <m>", where m is the smallest element of `arr`,
// computed with an OpenMP `min` reduction. For an empty vector the initial
// value INT_MAX is printed.
void min_reduction(vector<int>& arr) {
    int smallest = INT_MAX;
    #pragma omp parallel for reduction(min: smallest)
    for (int idx = 0; idx < arr.size(); ++idx) {
        const int candidate = arr[idx];
        smallest = (candidate < smallest) ? candidate : smallest;
    }
    cout << "Minimum value: " << smallest << endl;
}
// Prints "Maximum value: <m>", where m is the largest element of `arr`,
// computed with an OpenMP `max` reduction. For an empty vector the initial
// value INT_MIN is printed.
void max_reduction(vector<int>& arr) {
    int largest = INT_MIN;
    #pragma omp parallel for reduction(max: largest)
    for (int idx = 0; idx < arr.size(); ++idx) {
        const int candidate = arr[idx];
        largest = (candidate > largest) ? candidate : largest;
    }
    cout << "Maximum value: " << largest << endl;
}
// Prints "Sum: <s>", where s is the sum of all elements of `arr`, computed
// with an OpenMP `+` reduction. An empty vector prints "Sum: 0".
void sum_reduction(vector<int>& arr) {
    // Accumulate in 64 bits: an int accumulator overflows (undefined
    // behaviour) once the running total passes INT_MAX.
    long long sum = 0;
    const long long count = static_cast<long long>(arr.size());
    #pragma omp parallel for reduction(+: sum)
    for (long long i = 0; i < count; i++) {
        sum += arr[i];
    }
    cout << "Sum: " << sum << endl;
}
// Prints "Average: <a>", where a is the arithmetic mean of `arr`; the sum is
// computed with an OpenMP `+` reduction. An empty vector has no mean, so a
// message saying so is printed instead of dividing by zero.
void average_reduction(vector<int>& arr) {
    if (arr.empty()) {
        cout << "Average: undefined (empty input)" << endl;
        return;
    }
    // Accumulate in 64 bits: an int accumulator overflows (undefined
    // behaviour) once the running total passes INT_MAX.
    long long sum = 0;
    const long long count = static_cast<long long>(arr.size());
    #pragma omp parallel for reduction(+: sum)
    for (long long i = 0; i < count; i++) {
        sum += arr[i];
    }
    cout << "Average: " << static_cast<double>(sum) / static_cast<double>(count) << endl;
}
// Runs the four reductions (min, max, sum, average) on a fixed sample of
// nine integers.
int main() {
    const int samples[] = {5, 2, 9, 1, 7, 6, 8, 3, 4};
    const int sampleCount = sizeof(samples) / sizeof(samples[0]);
    vector<int> arr(samples, samples + sampleCount);

    min_reduction(arr);
    max_reduction(arr);
    sum_reduction(arr);
    average_reduction(arr);
}
C++ sequential bubble sort and parallel merge sort (OpenMP)
#include <iostream>
#include <vector>
#include <cstdlib>
#include <omp.h>
// Function to perform bubble sort
// Sorts `arr` in ascending order, in place, with sequential bubble sort.
// After each pass the largest remaining element sits at the end of the
// unsorted prefix, so that prefix shrinks by one element per pass.
void bubbleSort(std::vector<int>& arr) {
    const int count = arr.size();
    for (int lastUnsorted = count - 1; lastUnsorted > 0; --lastUnsorted) {
        for (int pos = 0; pos < lastUnsorted; ++pos) {
            if (arr[pos + 1] < arr[pos]) {
                // Exchange the out-of-order neighbours.
                const int held = arr[pos];
                arr[pos] = arr[pos + 1];
                arr[pos + 1] = held;
            }
        }
    }
}
// Function to merge two sorted subvectors
// Merges the two adjacent sorted runs arr[l..m] and arr[m+1..r] (bounds
// inclusive) into one sorted run occupying arr[l..r].
// Both runs are copied out first, so the merged result can be written back
// in place. On equal keys the element from the left run is taken first.
void merge(std::vector<int>& arr, int l, int m, int r) {
    const int leftLen = m - l + 1;
    const int rightLen = r - m;

    // Scratch copies of the two runs.
    std::vector<int> left(leftLen), right(rightLen);
    for (int a = 0; a < leftLen; ++a) {
        left[a] = arr[l + a];
    }
    for (int b = 0; b < rightLen; ++b) {
        right[b] = arr[m + 1 + b];
    }

    // Fill every slot of arr[l..r] with the smaller of the two run heads.
    int li = 0;
    int ri = 0;
    for (int out = l; out <= r; ++out) {
        // Take from the left run when the right run is exhausted, or when
        // the left run still has elements and its head is not larger.
        const bool takeLeft = (ri >= rightLen) || (li < leftLen && left[li] <= right[ri]);
        if (takeLeft) {
            arr[out] = left[li];
            ++li;
        } else {
            arr[out] = right[ri];
            ++ri;
        }
    }
}
// Function to perform merge sort
// Recursive worker for mergeSort: sorts arr[l..r] (bounds inclusive).
// Each half is sorted in its own OpenMP task; when no OpenMP team is active
// (or OpenMP is disabled) the pragmas have no effect and the halves are
// sorted one after the other.
static void mergeSortTasks(std::vector<int>& arr, int l, int r) {
    if (l >= r) {
        return;
    }
    const int m = l + (r - l) / 2;
    // Creating a task for a tiny range costs more than sorting it, so small
    // ranges are handled inline by the current thread.
    const bool spawn = (r - l) >= 1024;
    #pragma omp task shared(arr) if(spawn)
    mergeSortTasks(arr, l, m);
    #pragma omp task shared(arr) if(spawn)
    mergeSortTasks(arr, m + 1, r);
    // Both halves must be completely sorted before they are merged.
    #pragma omp taskwait
    merge(arr, l, m, r);
}

// Function to perform merge sort on arr[l..r] (bounds inclusive), ascending
// and in place. One parallel region is opened for the whole sort and a
// single thread seeds the recursion; the other threads of the team pick up
// the tasks it generates. The previous version opened a new
// `parallel sections` region at every recursion level, which either runs the
// inner levels serially (nested parallelism off) or oversubscribes threads.
void mergeSort(std::vector<int>& arr, int l, int r) {
    #pragma omp parallel
    {
        #pragma omp single
        mergeSortTasks(arr, l, r);
    }
}
// Function to print a vector
// Writes every element of `arr` to standard output, each followed by one
// space, then ends the line and flushes the stream.
void printVector(const std::vector<int>& arr) {
    typedef std::vector<int>::const_iterator Iter;
    for (Iter it = arr.begin(); it != arr.end(); ++it) {
        std::cout << *it << " ";
    }
    std::cout << std::endl;
}
// Benchmark driver: fills a vector with pseudo-random values, prints it,
// then times sequential bubble sort and parallel merge sort on identical
// copies of the data using the OpenMP wall clock.
int main() {
    const int n = 10000; // Size of vector

    // Initialize vector with random values (fixed seed, values in [0, 9999]).
    srand(42);
    std::vector<int> arr(n);
    for (int idx = 0; idx < n; ++idx) {
        arr[idx] = rand() % 10000;
    }
    // Second copy so both algorithms sort the same input.
    std::vector<int> arr_copy(arr);

    std::cout << "Original vector:" << std::endl;
    printVector(arr);

    // Sequential bubble sort
    const double bubbleBegin = omp_get_wtime();
    bubbleSort(arr);
    const double bubbleEnd = omp_get_wtime();
    std::cout << "\nSequential Bubble Sort: " << bubbleEnd - bubbleBegin << " seconds" << std::endl;
    //printVector(arr);

    // Parallel merge sort
    const double mergeBegin = omp_get_wtime();
    mergeSort(arr_copy, 0, n - 1);
    const double mergeEnd = omp_get_wtime();
    std::cout << "\nParallel Merge Sort: " << mergeEnd - mergeBegin << " seconds" << std::endl;
    //printVector(arr_copy);

    return 0;
}
💖 💪 🙅 🚩
Abhijeet Gavali
Posted on April 23, 2024
Join Our Newsletter. No Spam, Only the good stuff.
Sign up to receive the latest update from our blog.