Benchmarking CSV File Processing: Golang vs NestJS vs PHP vs Python

rocklinda

Linda Sebastian

Posted on August 12, 2024

Benchmarking CSV File Processing: Golang vs NestJS vs PHP vs Python

Introduction

Processing large CSV files efficiently is a common requirement in many applications, from data analysis to ETL (Extract, Transform, Load) processes. In this article, I want to benchmark the performance of four popular programming languages—Golang, NodeJS with NestJS, PHP, and Python—in handling large CSV files on a MacBook Pro M1. I aim to determine which language provides the best performance for this task.

Test Environment

Hardware: MacBook Pro M1, 256GB SSD, 8GB RAM

Software:

  • macOS Sonoma 14.5
  • PHP 8.3.6
  • Golang 1.22.4
  • Node.js 22.0.0 with NestJS
  • Python 3.12.3

Test Data

I used a synthetic CSV file named sales_data.csv with approximately 1 million rows, each containing transaction details such as transaction_id, product_id, quantity, price, and timestamp.

Task Description

For each language, the script performs the following tasks:

  1. Reads the CSV file.
  2. Calculates the total sales amount.
  3. Identifies the product with the highest sales.

Implementation

Here are the scripts used for each language:

Golang Script:

sales.go



package main

import (
    "encoding/csv"
    "fmt"
    "os"
    "strconv"
    "time"
)

func main() {
    start := time.Now()

    file, err := os.Open("../generate-csv/sales_data.csv")
    if err != nil {
        fmt.Println("Error:", err)
        return
    }
    defer file.Close()

    reader := csv.NewReader(file)
    _, _ = reader.Read() // Skip header

    totalSales := 0.0
    productSales := make(map[string]float64)

    for {
        line, err := reader.Read()
        if err != nil {
            break
        }
        productID := line[1]
        quantity, _ := strconv.Atoi(line[2])
        price, _ := strconv.ParseFloat(line[3], 64)
        total := float64(quantity) * price

        totalSales += total
        productSales[productID] += total
    }

    var topProduct string
    var topSales float64
    for product, sales := range productSales {
        if sales > topSales {
            topProduct = product
            topSales = sales
        }
    }

    elapsed := time.Since(start)
    fmt.Printf("Golang Execution time: %s\n", elapsed)
    fmt.Printf("Total Sales: $%.2f\n", totalSales)
    fmt.Printf("Top Product: %s with sales $%.2f\n", topProduct, topSales)
}



Enter fullscreen mode Exit fullscreen mode

NestJS Script:

csv.service.ts



import { Injectable } from '@nestjs/common';
import * as fs from 'fs';
import * as fastcsv from 'fast-csv';

// path file CSV
const GLOBAL_CSV_PATH = '../generate-csv/sales_data.csv';

@Injectable()
@Injectable()
export class CsvService {
  async parseCsv(): Promise<{
    nestExecutionTime: number;
    totalSales: number;
    topProductSales: number;
  }> {
    return new Promise((resolve, reject) => {
      const startTime = process.hrtime();

      let totalSales = 0;
      const productSales: { [key: string]: number } = {};

      fs.createReadStream(GLOBAL_CSV_PATH)
        .pipe(fastcsv.parse({ headers: true, delimiter: ',' }))
        .on('data', (row) => {
          const productID = row.product_id;
          const quantity = parseInt(row.quantity, 10);
          const price = parseFloat(row.price);
          const total = quantity * price;
          totalSales += total;
          if (!productSales[productID]) {
            productSales[productID] = 0;
          }
          productSales[productID] += total;
        })
        .on('end', () => {
          const topProduct = Object.keys(productSales).reduce((a, b) =>
            productSales[a] > productSales[b] ? a : b,
          );
          const topProductSales = productSales[topProduct] || 0;
          const endTime = process.hrtime(startTime);
          const nestExecutionTime = endTime[0] + endTime[1] / 1e9;

          console.log(`NestJS Execution time: ${nestExecutionTime} seconds`);
          console.log(`Total Sales: $${totalSales}`);
          console.log(
            `Top Product: ${topProduct} with sales $${topProductSales}`,
          );

          resolve({
            nestExecutionTime,
            totalSales,
            topProductSales,
          });
        })
        .on('error', (error) => reject(error));
    });
  }
}


Enter fullscreen mode Exit fullscreen mode

csv.controller.ts



import { Controller, Get } from '@nestjs/common';
import { CsvService } from './csv.service';

@Controller('csv')
export class CsvController {
  constructor(private readonly csvService: CsvService) {}

  @Get('parse')
  async parseCsv(): Promise<{
    nestExecutionTime: number;
    totalSales: number;
    topProductSales: number;
  }> {
    return this.csvService.parseCsv();
  }
}


Enter fullscreen mode Exit fullscreen mode

PHP Script

sales.php



<?php
$start_time = microtime(true);

$file = fopen("../generate-csv/sales_data.csv", "r");
$total_sales = 0;
$product_sales = [];

fgetcsv($file); // Skip header
while (($line = fgetcsv($file)) !== false) {
    $product_id = $line[1];
    $quantity = (int)$line[2];
    $price = (float)$line[3];
    $total = $quantity * $price;

    $total_sales += $total;
    if (!isset($product_sales[$product_id])) {
        $product_sales[$product_id] = 0;
    }
    $product_sales[$product_id] += $total;
}
fclose($file);

arsort($product_sales);
$top_product = array_key_first($product_sales);

$end_time = microtime(true);
$execution_time = ($end_time - $start_time);

echo "PHP Execution time: ".$execution_time." seconds\n";
echo "Total Sales: $".$total_sales."\n";
echo "Top Product: ".$top_product." with sales $".$product_sales[$top_product]."\n";


Enter fullscreen mode Exit fullscreen mode

Python Script



import csv
import time

# Input file name config
input_file = '../generate-csv/sales_data.csv'


def parse_csv(file_path):
    start_time = time.time()

    total_sales = 0
    product_sales = {}

    with open(file_path, mode='r') as file:
        reader = csv.DictReader(file)

        for row in reader:
            product_id = row['product_id']
            quantity = int(row['quantity'])
            price = float(row['price'])
            total = quantity * price
            total_sales += total

            if product_id not in product_sales:
                product_sales[product_id] = 0
            product_sales[product_id] += total

    top_product = max(product_sales, key=product_sales.get)
    execution_time = time.time() - start_time

    return {
        'total_sales': total_sales,
        'top_product': top_product,
        'top_product_sales': product_sales[top_product],
        'execution_time': execution_time,
    }


if __name__ == "__main__":
    result = parse_csv(input_file)
    print(f"Python Execution time: {result['execution_time']:.2f} seconds")
    print(f"Total Sales: ${result['total_sales']:.2f}")
    print(f"Top Product: {result['top_product']} with sales ${
          result['top_product_sales']:.2f}")



Enter fullscreen mode Exit fullscreen mode

Results

Here are the results of our benchmark tests:

Golang

  • Execution time: 466.69975ms
  • Total Sales: $274654985.36
  • Top Product: Product 1126 with sales $305922.81

golang result

NestJS

  • Execution time: 6.730134208 seconds
  • Total Sales: $274654985.36000216
  • Top Product: 1126 with sales $305922.8099999997

nestJS result

PHP

  • Execution time: 1.5142710208893 seconds
  • Total Sales: $274654985.36
  • Top Product: 1126 with sales $305922.81

PHP result

Python

  • Execution time: 2.56 seconds
  • Total Sales: $274654985.36
  • Top Product: 1126 with sales $305922.81

Python result

Analysis

My benchmark reveals several interesting insights:

Execution Time: Golang performed the best in terms of execution time, followed closely by PHP8, while NestJS took the longest time to complete the task.
Memory Usage: Build NestJS demonstrated efficient memory usage, while Python showed higher memory consumption.
Ease of Implementation: Golang provided the most straightforward implementation, while NestJS required more lines of code and complexity.

Conclusion

Based on my findings, Golang offers the best performance speed and memory efficiency, making it an excellent choice for handling large datasets.

Complete Code

You can get the full code on My Github repository
csv-parsing-battle.

Update Refactor Go

I refactor the part of Go from two loops to one loop for simplicity.



package main

import (
    "encoding/csv"
    "fmt"
    "os"
    "strconv"
    "time"
)

func main() {
    start := time.Now()

    file, err := os.Open("../generate-csv/sales_data.csv")
    if err != nil {
        fmt.Println("Error:", err)
        return
    }
    defer file.Close()

    reader := csv.NewReader(file)
    _, _ = reader.Read() // Skip header

    totalSales := 0.0
    productSales := make(map[string]float64)

    var topProduct string
    var topSales float64

    for {
        line, err := reader.Read()
        if err != nil {
            break
        }

        productID := line[1]
        quantity, _ := strconv.Atoi(line[2])
        price, _ := strconv.ParseFloat(line[3], 64)
        total := float64(quantity) * price

        totalSales += total
        productSales[productID] += total

        // Update topProduct and topSales in the same loop
        if productSales[productID] > topSales {
            topProduct = productID
            topSales = productSales[productID]
        }
    }

    elapsed := time.Since(start)
    fmt.Printf("Golang Execution time: %s\n", elapsed)
    fmt.Printf("Total Sales: $%.2f\n", totalSales)
    fmt.Printf("Top Product: %s with sales $%.2f\n", topProduct, topSales)
}


Enter fullscreen mode Exit fullscreen mode

The result barely any difference.

  • Execution time: 457.721917ms
  • Total Sales: $274654985.36
  • Top Product: Product 1126 with sales $305922.81 Go Refactor
💖 💪 🙅 🚩
rocklinda
Linda Sebastian

Posted on August 12, 2024

Join Our Newsletter. No Spam, Only the good stuff.

Sign up to receive the latest update from our blog.

Related