Linda Sebastian
Posted on August 12, 2024
Introduction
Processing large CSV files efficiently is a common requirement in many applications, from data analysis to ETL (Extract, Transform, Load) processes. In this article, I want to benchmark the performance of four popular programming languages—Golang, NodeJS with NestJS, PHP, and Python—in handling large CSV files on a MacBook Pro M1. I aim to determine which language provides the best performance for this task.
Test Environment
Hardware: MacBook Pro M1, 256GB SSD, 8GB RAM
Software:
- macOS Sonoma 14.5
- PHP 8.3.6
- Golang 1.22.4
- Node.js 22.0.0 with NestJS
- Python 3.12.3
Test Data
I used a synthetic CSV file named sales_data.csv with approximately 1 million rows, each containing transaction details such as transaction_id, product_id, quantity, price, and timestamp.
Task Description
For each language, the script performs the following tasks:
- Reads the CSV file.
- Calculates the total sales amount.
- Identifies the product with the highest sales.
Implementation
Here are the scripts used for each language:
Golang Script:
sales.go
package main
import (
"encoding/csv"
"fmt"
"os"
"strconv"
"time"
)
func main() {
start := time.Now()
file, err := os.Open("../generate-csv/sales_data.csv")
if err != nil {
fmt.Println("Error:", err)
return
}
defer file.Close()
reader := csv.NewReader(file)
_, _ = reader.Read() // Skip header
totalSales := 0.0
productSales := make(map[string]float64)
for {
line, err := reader.Read()
if err != nil {
break
}
productID := line[1]
quantity, _ := strconv.Atoi(line[2])
price, _ := strconv.ParseFloat(line[3], 64)
total := float64(quantity) * price
totalSales += total
productSales[productID] += total
}
var topProduct string
var topSales float64
for product, sales := range productSales {
if sales > topSales {
topProduct = product
topSales = sales
}
}
elapsed := time.Since(start)
fmt.Printf("Golang Execution time: %s\n", elapsed)
fmt.Printf("Total Sales: $%.2f\n", totalSales)
fmt.Printf("Top Product: %s with sales $%.2f\n", topProduct, topSales)
}
NestJS Script:
csv.service.ts
import { Injectable } from '@nestjs/common';
import * as fs from 'fs';
import * as fastcsv from 'fast-csv';
// path file CSV
const GLOBAL_CSV_PATH = '../generate-csv/sales_data.csv';
@Injectable()
@Injectable()
export class CsvService {
async parseCsv(): Promise<{
nestExecutionTime: number;
totalSales: number;
topProductSales: number;
}> {
return new Promise((resolve, reject) => {
const startTime = process.hrtime();
let totalSales = 0;
const productSales: { [key: string]: number } = {};
fs.createReadStream(GLOBAL_CSV_PATH)
.pipe(fastcsv.parse({ headers: true, delimiter: ',' }))
.on('data', (row) => {
const productID = row.product_id;
const quantity = parseInt(row.quantity, 10);
const price = parseFloat(row.price);
const total = quantity * price;
totalSales += total;
if (!productSales[productID]) {
productSales[productID] = 0;
}
productSales[productID] += total;
})
.on('end', () => {
const topProduct = Object.keys(productSales).reduce((a, b) =>
productSales[a] > productSales[b] ? a : b,
);
const topProductSales = productSales[topProduct] || 0;
const endTime = process.hrtime(startTime);
const nestExecutionTime = endTime[0] + endTime[1] / 1e9;
console.log(`NestJS Execution time: ${nestExecutionTime} seconds`);
console.log(`Total Sales: $${totalSales}`);
console.log(
`Top Product: ${topProduct} with sales $${topProductSales}`,
);
resolve({
nestExecutionTime,
totalSales,
topProductSales,
});
})
.on('error', (error) => reject(error));
});
}
}
csv.controller.ts
import { Controller, Get } from '@nestjs/common';
import { CsvService } from './csv.service';
@Controller('csv')
export class CsvController {
constructor(private readonly csvService: CsvService) {}
@Get('parse')
async parseCsv(): Promise<{
nestExecutionTime: number;
totalSales: number;
topProductSales: number;
}> {
return this.csvService.parseCsv();
}
}
PHP Script
sales.php
<?php
$start_time = microtime(true);
$file = fopen("../generate-csv/sales_data.csv", "r");
$total_sales = 0;
$product_sales = [];
fgetcsv($file); // Skip header
while (($line = fgetcsv($file)) !== false) {
$product_id = $line[1];
$quantity = (int)$line[2];
$price = (float)$line[3];
$total = $quantity * $price;
$total_sales += $total;
if (!isset($product_sales[$product_id])) {
$product_sales[$product_id] = 0;
}
$product_sales[$product_id] += $total;
}
fclose($file);
arsort($product_sales);
$top_product = array_key_first($product_sales);
$end_time = microtime(true);
$execution_time = ($end_time - $start_time);
echo "PHP Execution time: ".$execution_time." seconds\n";
echo "Total Sales: $".$total_sales."\n";
echo "Top Product: ".$top_product." with sales $".$product_sales[$top_product]."\n";
Python Script
import csv
import time
# Input file name config
input_file = '../generate-csv/sales_data.csv'
def parse_csv(file_path):
start_time = time.time()
total_sales = 0
product_sales = {}
with open(file_path, mode='r') as file:
reader = csv.DictReader(file)
for row in reader:
product_id = row['product_id']
quantity = int(row['quantity'])
price = float(row['price'])
total = quantity * price
total_sales += total
if product_id not in product_sales:
product_sales[product_id] = 0
product_sales[product_id] += total
top_product = max(product_sales, key=product_sales.get)
execution_time = time.time() - start_time
return {
'total_sales': total_sales,
'top_product': top_product,
'top_product_sales': product_sales[top_product],
'execution_time': execution_time,
}
if __name__ == "__main__":
result = parse_csv(input_file)
print(f"Python Execution time: {result['execution_time']:.2f} seconds")
print(f"Total Sales: ${result['total_sales']:.2f}")
print(f"Top Product: {result['top_product']} with sales ${
result['top_product_sales']:.2f}")
Results
Here are the results of our benchmark tests:
Golang
- Execution time: 466.69975ms
- Total Sales: $274654985.36
- Top Product: Product 1126 with sales $305922.81
NestJS
- Execution time: 6.730134208 seconds
- Total Sales: $274654985.36000216
- Top Product: 1126 with sales $305922.8099999997
PHP
- Execution time: 1.5142710208893 seconds
- Total Sales: $274654985.36
- Top Product: 1126 with sales $305922.81
Python
- Execution time: 2.56 seconds
- Total Sales: $274654985.36
- Top Product: 1126 with sales $305922.81
Analysis
My benchmark reveals several interesting insights:
Execution Time: Golang performed the best in terms of execution time, followed closely by PHP8, while NestJS took the longest time to complete the task.
Memory Usage: Build NestJS demonstrated efficient memory usage, while Python showed higher memory consumption.
Ease of Implementation: Golang provided the most straightforward implementation, while NestJS required more lines of code and complexity.
Conclusion
Based on my findings, Golang offers the best performance speed and memory efficiency, making it an excellent choice for handling large datasets.
Complete Code
You can get the full code on My Github repository
csv-parsing-battle.
Update Refactor Go
I refactor the part of Go from two loops to one loop for simplicity.
package main
import (
"encoding/csv"
"fmt"
"os"
"strconv"
"time"
)
func main() {
start := time.Now()
file, err := os.Open("../generate-csv/sales_data.csv")
if err != nil {
fmt.Println("Error:", err)
return
}
defer file.Close()
reader := csv.NewReader(file)
_, _ = reader.Read() // Skip header
totalSales := 0.0
productSales := make(map[string]float64)
var topProduct string
var topSales float64
for {
line, err := reader.Read()
if err != nil {
break
}
productID := line[1]
quantity, _ := strconv.Atoi(line[2])
price, _ := strconv.ParseFloat(line[3], 64)
total := float64(quantity) * price
totalSales += total
productSales[productID] += total
// Update topProduct and topSales in the same loop
if productSales[productID] > topSales {
topProduct = productID
topSales = productSales[productID]
}
}
elapsed := time.Since(start)
fmt.Printf("Golang Execution time: %s\n", elapsed)
fmt.Printf("Total Sales: $%.2f\n", totalSales)
fmt.Printf("Top Product: %s with sales $%.2f\n", topProduct, topSales)
}
The result barely any difference.
- Execution time: 457.721917ms
- Total Sales: $274654985.36
- Top Product: Product 1126 with sales $305922.81
Posted on August 12, 2024
Join Our Newsletter. No Spam, Only the good stuff.
Sign up to receive the latest update from our blog.