Convert XLSX file with multi level header to plain CSV and JSON

Convert XLSX file with multi level header to plain CSV or JSON file with all headers information.

Some systems exports data in table format with multiheader structure, that looks something like this.

1

If you need to use this data in some of your applications you can manually merge headers, but if this file is updated frequently these can be tediously. In this case I can offer you JS script that will do this work for you.

At the end of itexecution you will get csv like this. As you can see all headers are grouped and merged into one level:

2

Or JSON like this:

3

Script uses two packages: XLSX, json2csv. To install packages use:

npm install --save json2csv xlsx

Function that makes all work is:

const XLSX = require('xlsx');
const fs = require('fs');
const util = require('util');
const {Parser} = require('json2csv');

const xlxsToCSVSync = (file, fileOutput, skipTopLines) => {
	const letterToNumber = (val) => {
		var base = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', i, j, result = 0;
		for (i = 0, j = val.length - 1; i < val.length; i += 1, j -= 1) {
			result += Math.pow(base.length, j) * (base.indexOf(val[i]) + 1);
		}
		return result;
	};

	const cellParsing = (cellName) => {
		let match = cellName.match(/([A-Za-z]+)(\d+)/);
		if(!match) return false;
		return {c: letterToNumber(match[1])-1, r: match[2]-1}
	}

	if(!fs.existsSync(file)){
		throw `File ${file} does not exists`
	}

	const xlsxData = fs.readFileSync(file);
	const workbook = XLSX.read(xlsxData, {type:'buffer'});
	
	if(Object.keys(workbook.Sheets).length > 1) throw `Excel contains more than one page`;
	let sheet = Object.keys(workbook.Sheets)[0];

	// define header height
	let headerHeight = 1
	let headerLeftTop, headerRightBottom;

	for(let cell in workbook.Sheets[sheet]){
		let coor = cellParsing(cell);
		if(!coor) continue;
		if(coor.r < skipTopLines) continue;
		if(coor.r >= (skipTopLines + headerHeight)) break;

		let findMerge = workbook.Sheets[sheet]['!merges'].find(x => x.s.c == coor.c && x.s.r == coor.r);
		if(findMerge){
			let width = findMerge.e.c - findMerge.s.c + 1
			let height = findMerge.e.r - findMerge.s.r + 1
			if(height > 1){
				headerHeight = coor.r - skipTopLines + height
			}
		}

		if(typeof headerLeftTop == 'undefined' || (coor.c < headerLeftTop.coor.c && coor.r < headerLeftTop.coor.r)) headerLeftTop = {cell, coor};
		if(typeof headerRightBottom == 'undefined' || (coor.c > headerLeftTop.coor.c && coor.r > headerLeftTop.coor.r)) headerRightBottom = {cell, coor};
	}

	// define header cells
	for(let cell in workbook.Sheets[sheet]){
		let coor = cellParsing(cell);
		if(!coor) continue;
		if(coor.r < skipTopLines) continue;
		if(coor.r >= (skipTopLines + headerHeight)) break;

		let findMerge = workbook.Sheets[sheet]['!merges'].find(x => x.s.c == coor.c && x.s.r == coor.r);
		if(findMerge){
			let width = findMerge.e.c - findMerge.s.c + 1
			let height = findMerge.e.r - findMerge.s.r + 1
		}
	}

	console.log(`Header height: ${headerHeight}`);

	// header define
	let headers = []
	const headerParser = (topLevel, line, leftOffset, widthParent) => {
		if(typeof leftOffset == 'undefined') leftOffset = 0;
		if(typeof widthParent == 'undefined') widthParent = 0;
		// let childLeftOffset = leftOffset;

		for (let cell in workbook.Sheets[sheet]){
			let coor = cellParsing(cell);
			if(!coor) continue;

			// row filter
			if(coor.r < skipTopLines) continue;
			if((coor.r + 1) != (skipTopLines + topLevel)) continue;
			if(coor.r >= (skipTopLines + headerHeight)) break;

			// column filter
			if(coor.c < leftOffset) continue;
			if(widthParent > 0 && (coor.c + 1) > (leftOffset + widthParent)) break;

			let height, width;
			let findMerge = workbook.Sheets[sheet]['!merges'].find(x => x.s.c == coor.c && x.s.r == coor.r);
			if(findMerge){
				width = findMerge.e.c - findMerge.s.c + 1
				height = findMerge.e.r - findMerge.s.r + 1
			} else {
				height = 1
				width = 1
			}

			let cellValue = workbook.Sheets[sheet][cell].v;
			if(!cellValue) continue;

			let curLine = line
			if(line != '') curLine += '.';
			curLine += cellValue;

			if((height + topLevel - 1) == headerHeight){
				headers.push({
					name: curLine,
					column: coor.c, 
					width
				})
				// headers.push(curLine)
			} else if((height + topLevel - 1) < headerHeight){
				headerParser((topLevel + height), curLine, coor.c, width);
			}
		}
	}
	headerParser(1, '');

	console.log(`Number of data columns: ${headers.length}`)

	// default row object
	let defaultRow = {}
	let headersList = [] 
	headers.forEach(h => {
		defaultRow[h.name] = null;
		headersList.push(h.name);
	})

	// define header cells
	let data = []

	for(let cell in workbook.Sheets[sheet]){
		let coor = cellParsing(cell);
		if(coor.r < (skipTopLines + headerHeight)) continue;
		if(!coor) continue;

		let rowNumber = coor.r - headerHeight - skipTopLines;
		if(rowNumber < 0){
			console.log(`${cell} is lower 0 (${rowNumber})`)
			process.exit();
		}
		if(typeof data[rowNumber] == 'undefined'){
			data[rowNumber] = Object.assign({}, defaultRow);
		}
		
		let cellValue = workbook.Sheets[sheet][cell].v;
		if(!cellValue) continue;		

		let column = headers.find(x => x.column == coor.c);
		if(column){
			data[rowNumber][column.name] = cellValue;
		}
	}

	const json2csvParser = new Parser({ headersList });
	const csv = json2csvParser.parse(data);

	console.log(`Data rows: ${data.length}`);

	fs.writeFileSync(fileOutput, csv)
	return data;
}

module.exports = xlxsToCSVSync;

To execute it use next script. Make sure that previous function is located in the same directory and is called excelToCSV.js or make appropriate changes.

Also you need to change path to file on 11 line. Additionally you can specify number of rows that should be skipped in source xlsx file.

const fs = require('fs')
const path = require('path');

const xlxsToCSVSync = require(path.join(__dirname,'excelToCSV.js'));

console.log(`To convert XLSX file to CSV and JSON formats change variables 'fileName' and 'skipLines' in script.\n` + 
			`  'skipLines' variable defines number of lines that will be skipped at the top of xlxs file\n`+
			`XLSX file must contain only one page sheet to be procedeed\n`);

// -------------- CHANGED VARIABLES --------------
const fileName = './table.xlsx'
const skipLines = 0
// -----------------------------------------------

const filePath = path.join(__dirname, fileName);
const parse = path.parse(filePath);

console.log(`Try to convert file '${fileName}' (${filePath})`)

if(!fs.existsSync(filePath)){
	console.log(`File '${filePath}' does not exists`);
	process.exit();
}

(async () => {
	let data = xlxsToCSVSync(filePath, path.join(parse.dir, `${parse.base}.csv`), skipLines);
	fs.writeFileSync(path.join(parse.dir, `${parse.base}.json`), JSON.stringify(data))
})()

In my case files structure of folder with scripts looks like:

├── excelToCSV.js
├── exeScript.js
├── node_modules
│   ├── ...
├── package-lock.json
├── package.json
└── table.xlsx
├── table.xlsx.csv
└── table.xlsx.json