/**
Convert CSV formatted data to TSV format.

This program converts comma-separated value data to tab-separated format.

Copyright (c) 2016-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.csv2tsv;

import std.stdio;
import std.exception : enforce;
import std.format : format;
import std.range;
import std.traits : isArray, Unqual;
import std.typecons : tuple;

immutable helpText = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details the CSV formats accepted.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.

Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.

Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.

There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.

Behaviors of this program that often vary between CSV implementations:
* Newlines are supported in quoted fields.
* Double quotes are permitted in a non-quoted field. However, a field starting
with a quote must follow quoting rules.
* Each record can have a different numbers of fields.
* The three common forms of newlines are supported: CR, CRLF, LF. Output is
written using Unix newlines (LF).
* A newline will be added if the file does not end with one.
* A UTF-8 Byte Order Mark (BOM) at the start of a file will be removed.
* No whitespace trimming is done.

This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.

UTF-8 input is assumed. Convert other encodings prior to invoking this tool.

Options:
EOS";

/** Container for command line options.
 */
struct Csv2tsvOptions
{
    string programName;
    bool helpVerbose = false;           // --help-verbose
    bool hasHeader = false;             // --H|header
    char csvQuoteChar = '"';            // --q|quote
    char csvDelimChar = ',';            // --c|csv-delim
    char tsvDelimChar = '\t';           // --t|tsv-delim
    string tsvDelimReplacement = " ";   // --r|tab-replacement
    string newlineReplacement = " ";    // --n|newline-replacement
    bool versionWanted = false;         // --V|version

    /** Parses command line arguments into this struct and validates them.
     *
     * Returns tuple(false, exitCode) when the program should terminate without
     * further processing (help/version output, or an argument error), and
     * tuple(true, 0) when processing should continue. Validation failures are
     * reported to stderr and yield tuple(false, 1).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote", "CHR Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim", "CHR Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim", "CHR Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|tab-replacement", "STR Replacement for TSV field delimiters (typically TABs) found in CSV input. Default: Space.", &tsvDelimReplacement,
                "n|newline-replacement", "STR Replacement for newlines found in CSV input. Default: Space.", &newlineReplacement,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Consistency checks. Delimiters, quotes and replacement strings must not
             * collide with each other or with the record separator (newline). */
            enforce(csvQuoteChar != '\n' && csvQuoteChar != '\r',
                    "CSV quote character cannot be newline (--q|quote).");

            enforce(csvQuoteChar != csvDelimChar,
                    "CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");

            enforce(csvQuoteChar != tsvDelimChar,
                    "CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");

            enforce(csvDelimChar != '\n' && csvDelimChar != '\r',
                    "CSV field delimiter cannot be newline (--c|csv-delim).");

            enforce(tsvDelimChar != '\n' && tsvDelimChar != '\r',
                    "TSV field delimiter cannot be newline (--t|tsv-delim).");

            enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(tsvDelimReplacement),
                    "Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement).");

            enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(newlineReplacement),
                    "Replacement character cannot contain newlines or TSV field delimiters (--n|newline-replacement).");
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // No main in unittest
}
else
{
    /** Program entry point: parse arguments and run the conversion. */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try csv2tsvFiles(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            writeln();
            /* NOTE(review): flushing stdin is unusual; fflush on an input stream is
             * implementation-defined. stdout.flush may have been intended here to
             * push buffered output before the error message — confirm. */
            stdin.flush();
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}

/** Converts each input file (or stdin) to TSV and writes the result to standard
 * output. When --header is set, only the first file's header line is output;
 * subsequent files have one leading line skipped.
 */
void csv2tsvFiles(const ref Csv2tsvOptions cmdopt, const string[] inputFiles)
{
    import tsv_utils.common.utils : BufferedOutputRange;

    // 128 KB read buffer, shared across all input files.
    ubyte[1024 * 128] fileRawBuf;
    auto stdoutWriter = BufferedOutputRange!(typeof(stdout))(stdout);
    bool firstFile = true;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File;
        auto printFileName = (filename == "-") ? "stdin" : filename;

        auto skipLines = (firstFile || !cmdopt.hasHeader) ? 0 : 1;

        csv2tsv(inputStream, stdoutWriter, fileRawBuf, printFileName, skipLines,
                cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                cmdopt.newlineReplacement);

        firstFile = false;
    }
}

/* csv2tsv buffered conversion approach

This version of csv2tsv uses a buffered approach to csv-to-tsv conversion. This is a
change from the original version, which used a character-at-a-time approach, with
characters coming from an infinite stream of characters. The character-at-a-time
approach was nice from a simplicity perspective, but the approach didn't optimize well.
Note that the original version read input in blocks and wrote to stdout in blocks, it
was the conversion algorithm itself that was character oriented.
230 231 The idea is to convert a buffer at a time, writing larger blocks to the output stream 232 rather than one character at a time. In addition, the read buffer is modified in-place 233 when the only change is to convert a single character. The notable case is converting 234 the field delimiter character, typically comma to TAB. The result is writing longer 235 blocks to the output stream (BufferedOutputRange). 236 237 Performance improvements from the new algorithm are notable. This is especially true 238 versus the previous version 2.0.0. Note though that the more recent versions of 239 csv2tsv were slower due to degradations coming from compiler and/or language version. 240 Version 1.1.19 was quite a bit faster. Regardless of version, the performance 241 improvement is especially good when run against "simple" CSV files, with limited 242 amounts of CSV escape syntax. In these files the main change is converting the field 243 delimiter character, typically comma to TAB. 244 245 In some benchmarks on Mac OS, the new version was 40% faster than csv2tsv 2.0.0 on 246 files with significant CSV escapes, and 60% faster on files with limited CSV escapes. 247 Versus csv2tsv version 1.1.19, the new version is 10% and 40% faster on the same 248 files. On the "simple CSV" file, where Unix 'tr' is an option, 'tr' was still faster, 249 by about 20%. But getting into the 'tr' ballpark while retaining safety of correct 250 csv2tsv conversion is a good result. 251 252 Algorithm notes: 253 254 The algorithm works by reading an input block, then examining each byte in-order to 255 identify needed modifications. The region of consecutive characters without a change 256 is tracked. Single character changes are done in-place, in the read buffer. This 257 allows assembling longer blocks before write is needed. The region being tracked is 258 written to the output stream when it can no longer be extended in a continuous 259 fashion. At this point a new region is started. 
When the current read buffer has 260 been processed the current region is written out and a new block of data read in. 261 262 The read buffer uses fixed size blocks. This means the algorithm is actually 263 operating on bytes (UTF-8 code units), and not characters. This works because all 264 delimiters and CSV escape syntax characters are single byte UTF-8 characters. These 265 are the only characters requiring interpretation. The main nuisance is the 2-byte 266 CRLF newline sequence, as this might be split across two read buffers. This is 267 handled by embedding 'CR' states in the finite state machine. 268 269 Processing CSV escapes will often cause character removals and additions. These 270 will not be representable in a continuous stream of bytes without moving bytes around. 271 Instead of moving bytes, these cases are handled by immediately writing to the output 272 stream. This allows restarting a new block of contiguous characters. Handling by the 273 new algorithm is described below. Note that the length of the replacement characters 274 for TSV field and record delimiters (e.g. TAB, newline) affects the processing. 275 276 All replacement character lengths: 277 278 * Windows newline (CRLF) at the end of a line - Replace the CRLF with LF. 279 280 Replace the CR with LF, add it to the current write region and terminate it. The 281 next write region starts at the character after the LF. 282 283 * Double quote starting or ending a field - Drop the double quote. 284 285 Terminate the current write region, next write region starts at the next character. 286 287 * Double quote pair inside a quoted field - Drop one of the double quotes. 288 289 The algorithm drops the first double quote and keeps the second. This avoids 290 look-ahead and both field terminating double quote and double quote pair can be 291 handled the same way. Terminate the current write region without adding the double 292 quote. The next write region starts at the next character.
293 294 Single byte replacement characters: 295 296 * Windows newline (CRLF) in a quoted field 297 298 Replace the CR with the replacement char, add it to the current write region and 299 terminate it. The next write region starts at the character after the LF. 300 301 Multi-byte replacement sequences: 302 303 * TSV Delimiter (TAB by default) in a field 304 305 Terminate the current write region, write it out and the replacement. The next 306 write region starts at the next character. 307 308 * LF, CR, or CRLF in a quoted field 309 310 Terminate the current write region, write it and the replacement. The next write 311 region starts at the next character. 312 313 csv2tsv API 314 315 At the API level, it is desirable to handle at both open files and input streams. 316 Open files are the key requirement, but handling input streams simplifies unit 317 testing, and in-memory conversion is likely to be useful anyway. Internally, it 318 should be easy enough to encapsulate the differences between input streams and files. 319 Reading files can be done using File.byChunk and reading from input streams can be 320 done using std.range.chunks. 321 322 This has been handled by creating a new range that can iterate either files or 323 input streams chunk-by-chunk. 324 */ 325 326 /** Defines the 'bufferable' input sources supported by inputSourceByChunk. 327 * 328 * This includes std.stdio.File objects and mutable dynamic ubyte arrays (inputRange 329 * with slicing). 330 * 331 * Note: The mutable, dynamic arrays restriction is based on what is supported by 332 * std.range.chunks. This could be extended to include any type of array with ubyte 333 * elements, but it would require custom code in inputSourceByChunk. A test could be 334 * added as '(isArray!(R) && is(Unqual!(typeof(R.init[0])) == ubyte))'. 
 */
enum bool isBufferableInputSource(R) =
    isFileHandle!(Unqual!R) ||
    (isInputRange!R && is(ElementEncodingType!R == ubyte) && hasSlicing!R);

@safe unittest
{
    static assert(isBufferableInputSource!(File));
    static assert(isBufferableInputSource!(typeof(stdin)));
    static assert(isBufferableInputSource!(ubyte[]));
    static assert(!isBufferableInputSource!(char[]));
    static assert(!isBufferableInputSource!(string));

    ubyte[10] x1;
    const ubyte[1] x2;
    immutable ubyte[1] x3;
    ubyte[] x4 = new ubyte[](10);
    const ubyte[] x5 = new ubyte[](10);
    immutable ubyte[] x6 = new ubyte[](10);

    /* Static arrays and non-mutable dynamic arrays do not qualify; only a
     * mutable, dynamic ubyte[] does. */
    static assert(!isBufferableInputSource!(typeof(x1)));
    static assert(!isBufferableInputSource!(typeof(x2)));
    static assert(!isBufferableInputSource!(typeof(x3)));
    static assert(isBufferableInputSource!(typeof(x4)));
    static assert(!isBufferableInputSource!(typeof(x5)));
    static assert(!isBufferableInputSource!(typeof(x6)));

    static assert(is(Unqual!(ElementType!(typeof(x1))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x2))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x3))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x4))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x5))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x6))) == ubyte));

    /* S1: an input range of ubyte, but without slicing - should not qualify. */
    struct S1
    {
        void popFront();
        @property bool empty();
        @property ubyte front();
    }

    /* S2: an input range of ubyte with save, length and slicing - qualifies. */
    struct S2
    {
        @property ubyte front();
        void popFront();
        @property bool empty();
        @property auto save() { return this; }
        @property size_t length();
        S2 opSlice(size_t, size_t);
    }

    static assert(isInputRange!S1);
    static assert(!isBufferableInputSource!S1);

    static assert(isInputRange!S2);
    static assert(is(ElementEncodingType!S2 == ubyte));
    static assert(hasSlicing!S2);
    static assert(isBufferableInputSource!S2);

    /* For code coverage. */
    S2 s2;
    auto x = s2.save;
}

/** inputSourceByChunk returns a range that reads either a file handle (File) or a
 * ubyte[] array a chunk at a time.
 *
 * This is a cover for File.byChunk that allows passing an in-memory array as well.
 * At present the motivation is primarily to enable unit testing of chunk-based
 * algorithms using in-memory strings. At present the in-memory input types are
 * limited. In the future this may be changed to accept any type of character or
 * ubyte array.
 *
 * inputSourceByChunk takes either a File open for reading or a ubyte[] array
 * containing input data. Data is read a buffer at a time. The buffer can be
 * user provided, or allocated by inputSourceByChunk based on a caller provided
 * buffer size.
 *
 * A ubyte[] input source must satisfy isBufferableInputSource, which at present
 * means that it is a dynamic, mutable ubyte[].
 *
 * The chunks are returned as an input range.
 */
auto inputSourceByChunk(InputSource)(InputSource source, size_t size)
{
    // Convenience overload: allocate the read buffer on behalf of the caller.
    return inputSourceByChunk(source, new ubyte[](size));
}

/// Ditto
auto inputSourceByChunk(InputSource)(InputSource source, ubyte[] buffer)
if (isBufferableInputSource!InputSource)
{
    static if (isFileHandle!(Unqual!InputSource))
    {
        return source.byChunk(buffer);
    }
    else
    {
        /* In-memory case: adapt std.range.chunks to copy each chunk into the
         * caller-provided buffer, mirroring File.byChunk semantics. */
        static struct BufferedChunk
        {
            private Chunks!InputSource _chunks;
            private ubyte[] _buffer;

            private void readNextChunk()
            {
                if (_chunks.empty)
                {
                    // Zero-length buffer marks the range as exhausted (see empty()).
                    _buffer.length = 0;
                }
                else
                {
                    size_t len = _chunks.front.length;
                    _buffer[0 .. len] = _chunks.front[];
                    _chunks.popFront;

                    /* Only the last chunk should be shorter than the buffer. */
                    assert(_buffer.length == len || _chunks.empty);

                    if (_buffer.length != len) _buffer.length = len;
                }
            }

            this(InputSource source, ubyte[] buffer)
            {
                enforce(buffer.length > 0, "buffer size must be larger than 0");
                _chunks = source.chunks(buffer.length);
                _buffer = buffer;
                readNextChunk();
            }

            @property bool empty()
            {
                return (_buffer.length == 0);
            }

            @property ubyte[] front()
            {
                assert(!empty, "Attempting to fetch the front of an empty inputSourceByChunks");
                return _buffer;
            }

            void popFront()
            {
                assert(!empty, "Attempting to popFront an empty inputSourceByChunks");
                readNextChunk();
            }
        }

        return BufferedChunk(source, buffer);
    }
}

unittest // inputSourceByChunk
{
    import tsv_utils.common.unittest_utils;   // tsv-utils unit test helpers
    import std.file : mkdir, rmdirRecurse;
    import std.path : buildPath;

    auto testDir = makeUnittestTempDir("csv2tsv_inputSourceByChunk");
    scope(exit) testDir.rmdirRecurse;

    import std.algorithm : equal, joiner;
    import std.format;
    import std.string : representation;

    // Multi-byte UTF-8 data ensures chunk boundaries can split code points.
    auto charData = "abcde,ßÀß,あめりか物語,012345";
    ubyte[] ubyteData = charData.dup.representation;

    ubyte[1024] rawBuffer;  // Must be larger than largest bufferSize in tests.

    void writeFileData(string filePath, ubyte[] data)
    {
        import std.stdio;

        auto f = filePath.File("w");
        f.rawWrite(data);
        f.close;
    }

    /* Exercise every (dataSize, bufferSize) combination, including buffers both
     * smaller and larger than the data, for all four source/buffer pairings. */
    foreach (size_t dataSize; 0 .. ubyteData.length)
    {
        auto data = ubyteData[0 .. dataSize];
        auto filePath = buildPath(testDir, format("data_%d.txt", dataSize));
        writeFileData(filePath, data);

        foreach (size_t bufferSize; 1 .. dataSize + 2)
        {
            assert(data.inputSourceByChunk(bufferSize).joiner.equal(data),
                   format("[Test-A] dataSize: %d, bufferSize: %d", dataSize, bufferSize));

            assert (rawBuffer.length >= bufferSize);

            ubyte[] buffer = rawBuffer[0 .. bufferSize];
            assert(data.inputSourceByChunk(buffer).joiner.equal(data),
                   format("[Test-B] dataSize: %d, bufferSize: %d", dataSize, bufferSize));

            {
                auto inputStream = filePath.File;
                assert(inputStream.inputSourceByChunk(bufferSize).joiner.equal(data),
                       format("[Test-C] dataSize: %d, bufferSize: %d", dataSize, bufferSize));
                inputStream.close;
            }

            {
                auto inputStream = filePath.File;
                assert(inputStream.inputSourceByChunk(buffer).joiner.equal(data),
                       format("[Test-D] dataSize: %d, bufferSize: %d", dataSize, bufferSize));
                inputStream.close;
            }
        }
    }
}

/** Read CSV from an input source, convert to TSV and write to an output source.
 *
 * Params:
 *   inputSource = A "bufferable" input source, either a file open for
 *       read, or a dynamic, mutable ubyte array.
 *   outputStream = An output range to write TSV bytes to.
 *   readBuffer = A buffer to use for reading.
 *   filename = Name of file to use when reporting errors. A descriptive
 *       name can be used in lieu of a file name.
 *   skipLines = Number of lines to skip before outputting records.
 *       Typically used to skip writing header lines.
 *   csvQuote = The quoting character used in the CSV input.
 *   csvDelim = The field delimiter character used in the CSV input.
 *   tsvDelim = The field delimiter character to use in the TSV output.
 *   tsvDelimReplacement = String to use when replacing TSV field delimiters
 *       (e.g. TABs) found in the CSV data fields.
 *   tsvNewlineReplacement = String to use when replacing newlines found in the CSV
 *       data fields.
 *   discardBOM = If true (the default), a UTF-8 Byte Order Mark found at the
 *       start of the input stream will be dropped.
 *
 * Throws: Exception on finding inconsistent CSV. Exception text includes the filename and
 *     line number where the error was identified.
 */
void csv2tsv(InputSource, OutputRange)(
    InputSource inputSource,
    auto ref OutputRange outputStream,
    ubyte[] readBuffer,
    string filename = "(none)",
    size_t skipLines = 0,
    const char csvQuote = '"',
    const char csvDelim = ',',
    const char tsvDelim = '\t',
    const string tsvDelimReplacement = " ",
    const string tsvNewlineReplacement = " ",
    bool discardBOM = true,
)
if (isBufferableInputSource!InputSource &&
    isOutputRange!(OutputRange, char))
{
    import std.conv: hexString;

    assert (readBuffer.length >= 1);

    enum char LF = '\n';
    enum char CR = '\r';

    enum ubyte[3] UTF8_BOM = cast(ubyte[3])hexString!"efbbbf";

    /* Process state information - These variables are defined either in the outer
     * context or within one of the foreach loops.
     *
     * * recordNum - The current CSV input line/record number. Starts at one.
     * * fieldNum - Field number in the current line/record. Field numbers are
     *   one-upped. The field number set to zero at the start of a new record,
     *   prior to processing the first character of the first field on the record.
     * * byteIndex - Read buffer index of the current byte being processed.
     * * csvState - The current state of CSV processing. In particular, the state
     *   of the finite state machine.
     * * writeRegionStart - Read buffer index where the next write starts from.
     * * nextIndex - The index of the current input ubyte being processed. The
     *   current write region extends from the writeRegionStart to nextIndex.
     * * nextChar - The current input ubyte. The ubyte/char at nextIndex.
     */

    enum CSVState
    {
        FieldEnd,            // Start of input or after consuming a field or record delimiter.
        NonQuotedField,      // Processing a non-quoted field
        QuotedField,         // Processing a quoted field
        QuoteInQuotedField,  // Last char was a quote in a quoted field
        CRAtFieldEnd,        // Last char was a CR terminating a record/line
        CRInQuotedField,     // Last char was a CR in a quoted field
    }

    CSVState csvState = CSVState.FieldEnd;
    size_t recordNum = 1;
    size_t fieldNum = 0;

    foreach (chunkIndex, inputChunkComplete; inputSource.inputSourceByChunk(readBuffer).enumerate)
    {
        size_t writeRegionStart = 0;

        /* Discard byte order marks at the start of input.
         * Note: Slicing the chunk in this fashion generates very good code, better
         * than other approaches like manipulating indices.
         */
        auto inputChunk =
            (discardBOM &&
             chunkIndex == 0 &&
             inputChunkComplete.length >= UTF8_BOM.length &&
             inputChunkComplete[0 .. UTF8_BOM.length] == UTF8_BOM
            )
            ? inputChunkComplete[UTF8_BOM.length .. $]
            : inputChunkComplete[];

        /* flushCurrentRegion flushes the current write region and moves the start of
         * the next write region one byte past the end of the current region. If
         * appendChars are provided they are output as well.
         *
         * This routine is called when the current character (byte) terminates the
         * current write region and should not itself be output. That is why the next
         * write region always starts one byte past the current region end.
         *
         * This routine is also called when the 'skipLines' region has been processed.
         * This is done to flush the region without actually writing it. This is done
         * by explicit checks in the finite state machine when newline characters
         * that terminate a record are processed. It would be nice to refactor this.
         */
        void flushCurrentRegion(size_t regionEnd, const char[] appendChars = "")
        {
            assert(regionEnd <= inputChunk.length);

            if (recordNum > skipLines)
            {
                if (regionEnd > writeRegionStart)
                {
                    outputStream.put(inputChunk[writeRegionStart .. regionEnd]);
                }
                if (appendChars.length > 0)
                {
                    outputStream.put(appendChars);
                }
            }

            writeRegionStart = regionEnd + 1;
        }

        foreach (size_t nextIndex, char nextChar; inputChunk)
        {
        OuterSwitch: final switch (csvState)
            {
            case CSVState.FieldEnd:
                /* Start of input or after consuming a field terminator. */
                ++fieldNum;

                /* Note: Can't use switch due to the 'goto case' to the OuterSwitch. */
                if (nextChar == csvQuote)
                {
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                }
                else
                {
                    /* Processing state change only. Don't consume the character. */
                    csvState = CSVState.NonQuotedField;
                    goto case CSVState.NonQuotedField;
                }

            case CSVState.NonQuotedField:
                switch (nextChar)
                {
                default:
                    break OuterSwitch;
                case csvDelim:
                    // In-place single byte replacement: comma becomes TAB.
                    inputChunk[nextIndex] = tsvDelim;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case LF:
                    if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case CR:
                    inputChunk[nextIndex] = LF;
                    if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.CRAtFieldEnd;
                    break OuterSwitch;
                case tsvDelim:
                    if (tsvDelimReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvDelimReplacement[0];
                    }
                    else
                    {
                        // Multi-byte replacement: must flush and append separately.
                        flushCurrentRegion(nextIndex, tsvDelimReplacement);
                    }
                    break OuterSwitch;
                }

            case CSVState.QuotedField:
                switch (nextChar)
                {
                default:
                    break OuterSwitch;
                case csvQuote:
                    /*
                     * Flush the current region, without the double quote. Switch state
                     * to QuoteInQuotedField, which determines whether to output a quote.
                     */
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuoteInQuotedField;
                    break OuterSwitch;

                case tsvDelim:
                    if (tsvDelimReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvDelimReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvDelimReplacement);
                    }
                    break OuterSwitch;
                case LF:
                    /* Newline in a quoted field. */
                    if (tsvNewlineReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvNewlineReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvNewlineReplacement);
                    }
                    break OuterSwitch;
                case CR:
                    /* Carriage Return in a quoted field. */
                    if (tsvNewlineReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvNewlineReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvNewlineReplacement);
                    }
                    csvState = CSVState.CRInQuotedField;
                    break OuterSwitch;
                }

            case CSVState.QuoteInQuotedField:
                /* Just processed a quote in a quoted field. The buffer, without the
                 * quote, was just flushed. Only legal characters here are quote,
                 * comma (field delimiter), newline (record delimiter).
                 */
                switch (nextChar)
                {
                case csvQuote:
                    // Escaped quote pair; the second quote is kept in the data.
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                case csvDelim:
                    inputChunk[nextIndex] = tsvDelim;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case LF:
                    if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case CR:
                    inputChunk[nextIndex] = LF;
                    if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.CRAtFieldEnd;
                    break OuterSwitch;
                default:
                    throw new Exception(
                        format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                               (filename == "-") ? "Standard Input" : filename,
                               recordNum));
                }

            case CSVState.CRInQuotedField:
                if (nextChar == LF)
                {
                    // CRLF inside a quoted field; the CR was already replaced, drop the LF.
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                }
                else {
                    /* Naked CR. State change only, don't consume current character. */
                    csvState = CSVState.QuotedField;
                    goto case CSVState.QuotedField;
                }

            case CSVState.CRAtFieldEnd:
                if (nextChar == LF)
                {
                    // CRLF line ending; the CR was already rewritten to LF, drop the LF.
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                }
                else {
                    /* Naked CR. State change only, don't consume current character. */
                    csvState = CSVState.FieldEnd;
                    goto case CSVState.FieldEnd;
                }
            }
        }

        /* End of buffer. */
        if (writeRegionStart < inputChunk.length && recordNum > skipLines)
        {
            outputStream.put(inputChunk[writeRegionStart .. $]);
        }

        writeRegionStart = 0;
    }

    enforce(csvState != CSVState.QuotedField,
            format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                   (filename == "-") ? "Standard Input" : filename,
                   recordNum));

    /* Output a newline if the CSV input did not have a terminating newline. */
    if (fieldNum > 0 && recordNum > skipLines) put(outputStream, '\n');
}

unittest
{
    /* Unit tests for the csv2tsv function.
     *
     * These unit tests exercise different CSV combinations and escaping cases. The CSV
     * data content is the same for each corresponding test string, except the delimiters
     * have been changed. e.g csv6a and csv6b have the same data content.
     *
     * A property used in these tests is that changing the CSV delimiters doesn't change
     * the resulting TSV. However, changing the TSV delimiters will change the TSV result,
     * as TSV doesn't support having its delimiters in the data.
This allows having a 873 * single TSV expected set that is generated by CSVs with different delimter sets. 874 * 875 * This test set does not test main, file handling, or error messages. These are 876 * handled by tests run against the executable. 877 * 878 * Note: unittest is non @safe due to the casts from string to ubyte[]. This can 879 * probably be rewritten to use std.string.representation instead, which is @safe. 880 */ 881 882 /* Default CSV. */ 883 auto csv1a = "a,b,c"; 884 auto csv2a = "a,bc,,,def"; 885 auto csv3a = ",a, b , cd ,"; 886 auto csv4a = "ß,ßÀß,あめりか物語,书名: 五色石"; 887 auto csv5a = "\"\n\",\"\n\n\",\"\n\n\n\""; 888 auto csv6a = "\"\t\",\"\t\t\",\"\t\t\t\""; 889 auto csv7a = "\",\",\",,\",\",,,\""; 890 auto csv8a = "\"\",\"\"\"\",\"\"\"\"\"\""; 891 auto csv9a = "\"ab, de\tfg\"\"\nhij\""; 892 auto csv10a = ""; 893 auto csv11a = ","; 894 auto csv12a = ",,"; 895 auto csv13a = "\"\r\",\"\r\r\",\"\r\r\r\""; 896 auto csv14a = "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\""; 897 auto csv15a = "\"ab, de\tfg\"\"\rhij\""; 898 auto csv16a = "\"ab, de\tfg\"\"\r\nhij\""; 899 auto csv17a = "ab\",ab\"cd"; 900 auto csv18a = "\n\n\n"; 901 auto csv19a = "\t"; 902 auto csv20a = "\t\t"; 903 auto csv21a = "a\n"; 904 auto csv22a = "a,\n"; 905 auto csv23a = "a,b\n"; 906 auto csv24a = ",\n"; 907 auto csv25a = "#"; 908 auto csv26a = "^"; 909 auto csv27a = "#^#"; 910 auto csv28a = "^#^"; 911 auto csv29a = "$"; 912 auto csv30a = "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n"; 913 auto csv31a = "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n"; 914 auto csv32a = ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\""; 915 916 // Newlines terminating a line ending a non-quoted field 917 auto csv33a = "\rX\r\nX\n\r\nX\r\n"; 918 919 // Newlines inside a quoted field and terminating a line following a quoted field 920 auto csv34a = "\"\r\",\"X\r\",\"X\rY\",\"\rY\"\r\"\r\n\",\"X\r\n\",\"X\r\nY\",\"\r\nY\"\r\n\"\n\",\"X\n\",\"X\nY\",\"\nY\"\n"; 921 922 // CR at field end 923 auto csv35a = 
"abc,def\r\"ghi\",\"jkl\"\r\"mno\",pqr\r"; 924 925 /* Set B has the same data and TSV results as set A, but uses # for quote and ^ for comma. */ 926 auto csv1b = "a^b^c"; 927 auto csv2b = "a^bc^^^def"; 928 auto csv3b = "^a^ b ^ cd ^"; 929 auto csv4b = "ß^ßÀß^あめりか物語^书名: 五色石"; 930 auto csv5b = "#\n#^#\n\n#^#\n\n\n#"; 931 auto csv6b = "#\t#^#\t\t#^#\t\t\t#"; 932 auto csv7b = "#,#^#,,#^#,,,#"; 933 auto csv8b = "##^#\"#^#\"\"#"; 934 auto csv9b = "#ab, de\tfg\"\nhij#"; 935 auto csv10b = ""; 936 auto csv11b = "^"; 937 auto csv12b = "^^"; 938 auto csv13b = "#\r#^#\r\r#^#\r\r\r#"; 939 auto csv14b = "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#"; 940 auto csv15b = "#ab, de\tfg\"\rhij#"; 941 auto csv16b = "#ab, de\tfg\"\r\nhij#"; 942 auto csv17b = "ab\"^ab\"cd"; 943 auto csv18b = "\n\n\n"; 944 auto csv19b = "\t"; 945 auto csv20b = "\t\t"; 946 auto csv21b = "a\n"; 947 auto csv22b = "a^\n"; 948 auto csv23b = "a^b\n"; 949 auto csv24b = "^\n"; 950 auto csv25b = "####"; 951 auto csv26b = "#^#"; 952 auto csv27b = "###^###"; 953 auto csv28b = "#^##^#"; 954 auto csv29b = "$"; 955 auto csv30b = "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n"; 956 auto csv31b = "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n"; 957 auto csv32b = "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#"; 958 auto csv33b = "\rX\r\nX\n\r\nX\r\n"; 959 auto csv34b = "#\r#^#X\r#^#X\rY#^#\rY#\r#\r\n#^#X\r\n#^#X\r\nY#^#\r\nY#\r\n#\n#^#X\n#^#X\nY#^#\nY#\n"; 960 auto csv35b = "abc^def\r#ghi#^#jkl#\r#mno#^pqr\r"; 961 962 /* The expected results for csv sets A and B. 
This is for the default TSV delimiters.*/ 963 auto tsv1 = "a\tb\tc\n"; 964 auto tsv2 = "a\tbc\t\t\tdef\n"; 965 auto tsv3 = "\ta\t b \t cd \t\n"; 966 auto tsv4 = "ß\tßÀß\tあめりか物語\t书名: 五色石\n"; 967 auto tsv5 = " \t \t \n"; 968 auto tsv6 = " \t \t \n"; 969 auto tsv7 = ",\t,,\t,,,\n"; 970 auto tsv8 = "\t\"\t\"\"\n"; 971 auto tsv9 = "ab, de fg\" hij\n"; 972 auto tsv10 = ""; 973 auto tsv11 = "\t\n"; 974 auto tsv12 = "\t\t\n"; 975 auto tsv13 = " \t \t \n"; 976 auto tsv14 = " \t \t \n"; 977 auto tsv15 = "ab, de fg\" hij\n"; 978 auto tsv16 = "ab, de fg\" hij\n"; 979 auto tsv17 = "ab\"\tab\"cd\n"; 980 auto tsv18 = "\n\n\n"; 981 auto tsv19 = " \n"; 982 auto tsv20 = " \n"; 983 auto tsv21 = "a\n"; 984 auto tsv22 = "a\t\n"; 985 auto tsv23 = "a\tb\n"; 986 auto tsv24 = "\t\n"; 987 auto tsv25 = "#\n"; 988 auto tsv26 = "^\n"; 989 auto tsv27 = "#^#\n"; 990 auto tsv28 = "^#^\n"; 991 auto tsv29 = "$\n"; 992 auto tsv30 = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n"; 993 auto tsv31 = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n"; 994 auto tsv32 = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n"; 995 auto tsv33 = "\nX\nX\n\nX\n"; 996 auto tsv34 = " \tX \tX Y\t Y\n \tX \tX Y\t Y\n \tX \tX Y\t Y\n"; 997 auto tsv35 = "abc\tdef\nghi\tjkl\nmno\tpqr\n"; 998 999 /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab. 1000 * This will also result in different replacements when TAB and $ appear in the CSV. 
1001 */ 1002 auto tsv1_x = "a$b$c\n"; 1003 auto tsv2_x = "a$bc$$$def\n"; 1004 auto tsv3_x = "$a$ b $ cd $\n"; 1005 auto tsv4_x = "ß$ßÀß$あめりか物語$书名: 五色石\n"; 1006 auto tsv5_x = " $ $ \n"; 1007 auto tsv6_x = "\t$\t\t$\t\t\t\n"; 1008 auto tsv7_x = ",$,,$,,,\n"; 1009 auto tsv8_x = "$\"$\"\"\n"; 1010 auto tsv9_x = "ab, de\tfg\" hij\n"; 1011 auto tsv10_x = ""; 1012 auto tsv11_x = "$\n"; 1013 auto tsv12_x = "$$\n"; 1014 auto tsv13_x = " $ $ \n"; 1015 auto tsv14_x = " $ $ \n"; 1016 auto tsv15_x = "ab, de\tfg\" hij\n"; 1017 auto tsv16_x = "ab, de\tfg\" hij\n"; 1018 auto tsv17_x = "ab\"$ab\"cd\n"; 1019 auto tsv18_x = "\n\n\n"; 1020 auto tsv19_x = "\t\n"; 1021 auto tsv20_x = "\t\t\n"; 1022 auto tsv21_x = "a\n"; 1023 auto tsv22_x = "a$\n"; 1024 auto tsv23_x = "a$b\n"; 1025 auto tsv24_x = "$\n"; 1026 auto tsv25_x = "#\n"; 1027 auto tsv26_x = "^\n"; 1028 auto tsv27_x = "#^#\n"; 1029 auto tsv28_x = "^#^\n"; 1030 auto tsv29_x = " \n"; 1031 auto tsv30_x = " $ \n $ $ \n^# $ #^$# ^$^ #\n"; 1032 auto tsv31_x = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n"; 1033 auto tsv32_x = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n"; 1034 auto tsv33_x = "\nX\nX\n\nX\n"; 1035 auto tsv34_x = " $X $X Y$ Y\n $X $X Y$ Y\n $X $X Y$ Y\n"; 1036 auto tsv35_x = "abc$def\nghi$jkl\nmno$pqr\n"; 1037 1038 /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab, 1039 * and with the delimiter/newline replacement string being |--|. Basically, newlines 1040 * and '$' in the original data are replaced by |--|. 
1041 */ 1042 auto tsv1_y = "a$b$c\n"; 1043 auto tsv2_y = "a$bc$$$def\n"; 1044 auto tsv3_y = "$a$ b $ cd $\n"; 1045 auto tsv4_y = "ß$ßÀß$あめりか物語$书名: 五色石\n"; 1046 auto tsv5_y = "|--|$|--||--|$|--||--||--|\n"; 1047 auto tsv6_y = "\t$\t\t$\t\t\t\n"; 1048 auto tsv7_y = ",$,,$,,,\n"; 1049 auto tsv8_y = "$\"$\"\"\n"; 1050 auto tsv9_y = "ab, de\tfg\"|--|hij\n"; 1051 auto tsv10_y = ""; 1052 auto tsv11_y = "$\n"; 1053 auto tsv12_y = "$$\n"; 1054 auto tsv13_y = "|--|$|--||--|$|--||--||--|\n"; 1055 auto tsv14_y = "|--|$|--||--|$|--||--||--|\n"; 1056 auto tsv15_y = "ab, de\tfg\"|--|hij\n"; 1057 auto tsv16_y = "ab, de\tfg\"|--|hij\n"; 1058 auto tsv17_y = "ab\"$ab\"cd\n"; 1059 auto tsv18_y = "\n\n\n"; 1060 auto tsv19_y = "\t\n"; 1061 auto tsv20_y = "\t\t\n"; 1062 auto tsv21_y = "a\n"; 1063 auto tsv22_y = "a$\n"; 1064 auto tsv23_y = "a$b\n"; 1065 auto tsv24_y = "$\n"; 1066 auto tsv25_y = "#\n"; 1067 auto tsv26_y = "^\n"; 1068 auto tsv27_y = "#^#\n"; 1069 auto tsv28_y = "^#^\n"; 1070 auto tsv29_y = "|--|\n"; 1071 auto tsv30_y = "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n"; 1072 auto tsv31_y = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n"; 1073 auto tsv32_y = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n"; 1074 auto tsv33_y = "\nX\nX\n\nX\n"; 1075 auto tsv34_y = "|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n"; 1076 auto tsv35_y = "abc$def\nghi$jkl\nmno$pqr\n"; 1077 1078 /* The TSV results for CSV sets 1a and 1b, but with the TAB replacement as |TAB| 1079 * and newline replacement |NL|. 
1080 */ 1081 auto tsv1_z = "a\tb\tc\n"; 1082 auto tsv2_z = "a\tbc\t\t\tdef\n"; 1083 auto tsv3_z = "\ta\t b \t cd \t\n"; 1084 auto tsv4_z = "ß\tßÀß\tあめりか物語\t书名: 五色石\n"; 1085 auto tsv5_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n"; 1086 auto tsv6_z = "<TAB>\t<TAB><TAB>\t<TAB><TAB><TAB>\n"; 1087 auto tsv7_z = ",\t,,\t,,,\n"; 1088 auto tsv8_z = "\t\"\t\"\"\n"; 1089 auto tsv9_z = "ab, de<TAB>fg\"<NL>hij\n"; 1090 auto tsv10_z = ""; 1091 auto tsv11_z = "\t\n"; 1092 auto tsv12_z = "\t\t\n"; 1093 auto tsv13_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n"; 1094 auto tsv14_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n"; 1095 auto tsv15_z = "ab, de<TAB>fg\"<NL>hij\n"; 1096 auto tsv16_z = "ab, de<TAB>fg\"<NL>hij\n"; 1097 auto tsv17_z = "ab\"\tab\"cd\n"; 1098 auto tsv18_z = "\n\n\n"; 1099 auto tsv19_z = "<TAB>\n"; 1100 auto tsv20_z = "<TAB><TAB>\n"; 1101 auto tsv21_z = "a\n"; 1102 auto tsv22_z = "a\t\n"; 1103 auto tsv23_z = "a\tb\n"; 1104 auto tsv24_z = "\t\n"; 1105 auto tsv25_z = "#\n"; 1106 auto tsv26_z = "^\n"; 1107 auto tsv27_z = "#^#\n"; 1108 auto tsv28_z = "^#^\n"; 1109 auto tsv29_z = "$\n"; 1110 auto tsv30_z = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n"; 1111 auto tsv31_z = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n"; 1112 auto tsv32_z = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n"; 1113 auto tsv33_z = "\nX\nX\n\nX\n"; 1114 auto tsv34_z = "<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n"; 1115 auto tsv35_z = "abc\tdef\nghi\tjkl\nmno\tpqr\n"; 1116 1117 /* Aggregate the test data into parallel arrays. 
*/ 1118 auto csvSet1a = [csv1a, csv2a, csv3a, csv4a, csv5a, csv6a, csv7a, csv8a, csv9a, csv10a, 1119 csv11a, csv12a, csv13a, csv14a, csv15a, csv16a, csv17a, csv18a, csv19a, csv20a, 1120 csv21a, csv22a, csv23a, csv24a, csv25a, csv26a, csv27a, csv28a, csv29a, csv30a, 1121 csv31a, csv32a, csv33a, csv34a, csv35a]; 1122 1123 auto csvSet1b = [csv1b, csv2b, csv3b, csv4b, csv5b, csv6b, csv7b, csv8b, csv9b, csv10b, 1124 csv11b, csv12b, csv13b, csv14b, csv15b, csv16b, csv17b, csv18b, csv19b, csv20b, 1125 csv21b, csv22b, csv23b, csv24b, csv25b, csv26b, csv27b, csv28b, csv29b, csv30b, 1126 csv31b, csv32b, csv33b, csv34b, csv35b]; 1127 1128 auto tsvSet1 = [tsv1, tsv2, tsv3, tsv4, tsv5, tsv6, tsv7, tsv8, tsv9, tsv10, 1129 tsv11, tsv12, tsv13, tsv14, tsv15, tsv16, tsv17, tsv18, tsv19, tsv20, 1130 tsv21, tsv22, tsv23, tsv24, tsv25, tsv26, tsv27, tsv28, tsv29, tsv30, 1131 tsv31, tsv32, tsv33, tsv34, tsv35]; 1132 1133 auto tsvSet1_x = [tsv1_x, tsv2_x, tsv3_x, tsv4_x, tsv5_x, tsv6_x, tsv7_x, tsv8_x, tsv9_x, tsv10_x, 1134 tsv11_x, tsv12_x, tsv13_x, tsv14_x, tsv15_x, tsv16_x, tsv17_x, tsv18_x, tsv19_x, tsv20_x, 1135 tsv21_x, tsv22_x, tsv23_x, tsv24_x, tsv25_x, tsv26_x, tsv27_x, tsv28_x, tsv29_x, tsv30_x, 1136 tsv31_x, tsv32_x, tsv33_x, tsv34_x, tsv35_x]; 1137 1138 auto tsvSet1_y = [tsv1_y, tsv2_y, tsv3_y, tsv4_y, tsv5_y, tsv6_y, tsv7_y, tsv8_y, tsv9_y, tsv10_y, 1139 tsv11_y, tsv12_y, tsv13_y, tsv14_y, tsv15_y, tsv16_y, tsv17_y, tsv18_y, tsv19_y, tsv20_y, 1140 tsv21_y, tsv22_y, tsv23_y, tsv24_y, tsv25_y, tsv26_y, tsv27_y, tsv28_y, tsv29_y, tsv30_y, 1141 tsv31_y, tsv32_y, tsv33_y, tsv34_y, tsv35_y]; 1142 1143 auto tsvSet1_z = [tsv1_z, tsv2_z, tsv3_z, tsv4_z, tsv5_z, tsv6_z, tsv7_z, tsv8_z, tsv9_z, tsv10_z, 1144 tsv11_z, tsv12_z, tsv13_z, tsv14_z, tsv15_z, tsv16_z, tsv17_z, tsv18_z, tsv19_z, tsv20_z, 1145 tsv21_z, tsv22_z, tsv23_z, tsv24_z, tsv25_z, tsv26_z, tsv27_z, tsv28_z, tsv29_z, tsv30_z, 1146 tsv31_z, tsv32_z, tsv33_z, tsv34_z, tsv35_z]; 1147 1148 /* The tests. 
*/ 1149 auto bufferSizeTests = [1, 2, 3, 8, 128]; 1150 1151 foreach (bufferSize; bufferSizeTests) 1152 { 1153 ubyte[] readBuffer = new ubyte[](bufferSize); 1154 1155 foreach (i, csva, csvb, tsv, tsv_x, tsv_y, tsv_z; lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y, tsvSet1_z)) 1156 { 1157 import std.conv : to; 1158 1159 /* Byte streams for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */ 1160 ubyte[] csvInputA = cast(ubyte[])csva; 1161 ubyte[] csvInputB = cast(ubyte[])csvb; 1162 1163 /* CSV Set A vs TSV expected. */ 1164 auto tsvResultA = appender!(char[])(); 1165 csv2tsv(csvInputA, tsvResultA, readBuffer, "csvInputA_defaultTSV"); 1166 assert(tsv == tsvResultA.data, 1167 format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1168 i + 1, csva, tsv, tsvResultA.data)); 1169 1170 /* CSV Set B vs TSV expected. Different CSV delimiters, same TSV results as CSV Set A.*/ 1171 auto tsvResultB = appender!(char[])(); 1172 csv2tsv(csvInputB, tsvResultB, readBuffer, "csvInputB_defaultTSV", 0, '#', '^'); 1173 assert(tsv == tsvResultB.data, 1174 format("Unittest failure. tsv != tsvResultB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1175 i + 1, csvb, tsv, tsvResultB.data)); 1176 1177 /* CSV Set A and TSV with $ separator.*/ 1178 csvInputA = cast(ubyte[])csva; 1179 auto tsvResult_XA = appender!(char[])(); 1180 csv2tsv(csvInputA, tsvResult_XA, readBuffer, "csvInputA_TSV_WithDollarDelimiter", 0, '"', ',', '$'); 1181 assert(tsv_x == tsvResult_XA.data, 1182 format("Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1183 i + 1, csva, tsv_x, tsvResult_XA.data)); 1184 1185 /* CSV Set B and TSV with $ separator. 
Same TSV results as CSV Set A.*/ 1186 csvInputB = cast(ubyte[])csvb; 1187 auto tsvResult_XB = appender!(char[])(); 1188 csv2tsv(csvInputB, tsvResult_XB, readBuffer, "csvInputB__TSV_WithDollarDelimiter", 0, '#', '^', '$'); 1189 assert(tsv_x == tsvResult_XB.data, 1190 format("Unittest failure. tsv_x != tsvResult_XB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1191 i + 1, csvb, tsv_x, tsvResult_XB.data)); 1192 1193 /* CSV Set A and TSV with $ separator and tsv delimiter/newline replacement. */ 1194 csvInputA = cast(ubyte[])csva; 1195 auto tsvResult_YA = appender!(char[])(); 1196 csv2tsv(csvInputA, tsvResult_YA, readBuffer, "csvInputA_TSV_WithDollarAndDelimReplacement", 0, '"', ',', '$', "|--|", "|--|"); 1197 assert(tsv_y == tsvResult_YA.data, 1198 format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1199 i + 1, csva, tsv_y, tsvResult_YA.data)); 1200 1201 /* CSV Set B and TSV with $ separator and tsv delimiter/newline replacement. Same TSV as CSV Set A.*/ 1202 csvInputB = cast(ubyte[])csvb; 1203 auto tsvResult_YB = appender!(char[])(); 1204 csv2tsv(csvInputB, tsvResult_YB, readBuffer, "csvInputB__TSV_WithDollarAndDelimReplacement", 0, '#', '^', '$', "|--|", "|--|"); 1205 assert(tsv_y == tsvResult_YB.data, 1206 format("Unittest failure. tsv_y != tsvResult_YB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1207 i + 1, csvb, tsv_y, tsvResult_YB.data)); 1208 1209 /* CSV Set A and TSV with TAB replacement as <TAB> and newline replacement as <NL>. Same TSV as CSV Set A.*/ 1210 csvInputA = cast(ubyte[])csva; 1211 auto tsvResult_ZA = appender!(char[])(); 1212 csv2tsv(csvInputA, tsvResult_ZA, readBuffer, "csvInputA_TSV_WithDifferentTABandNLReplacements", 0, '"', ',', '\t', "<TAB>", "<NL>"); 1213 assert(tsv_z == tsvResult_ZA.data, 1214 format("Unittest failure. tsv_z != tsvResult_ZA.data. 
Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1215 i + 1, csva, tsv_z, tsvResult_ZA.data)); 1216 } 1217 } 1218 } 1219 1220 // csv2tsv skiplines tests 1221 unittest 1222 { 1223 import std.string : representation; 1224 1225 auto csv1 = ""; 1226 auto csv2 = "a"; 1227 1228 auto csv3 = "\n"; 1229 auto csv4 = "\n\n"; 1230 auto csv5 = "\n\n\n"; 1231 1232 auto csv6 = "a\n"; 1233 auto csv7 = "a\nb\n"; 1234 auto csv8 = "a\nb\nc\n"; 1235 1236 auto csv9 = "\"\n\"\n"; 1237 auto csv10 = "\"\n\"\n\"\n\"\n"; 1238 auto csv11 = "\"\n\"\n\"\n\"\n\"\n\"\n"; 1239 1240 auto csv12 = "\r"; 1241 auto csv13 = "\r\r"; 1242 auto csv14 = "\r\r\r"; 1243 1244 auto csv15 = "a\r"; 1245 auto csv16 = "a\rb\r"; 1246 auto csv17 = "a\rb\rc\r"; 1247 1248 auto csv18 = "\"\r\"\r"; 1249 auto csv19 = "\"\r\"\r\"\r\"\r"; 1250 auto csv20 = "\"\r\"\r\"\r\"\r\"\r\"\r"; 1251 1252 auto csv21 = "\r\n"; 1253 auto csv22 = "\r\n\r\n"; 1254 auto csv23 = "\r\n\r\n\r\n"; 1255 1256 auto csv24 = "a\r\n"; 1257 auto csv25 = "a\r\nb\r\n"; 1258 auto csv26 = "a\r\nb\r\nc\r\n"; 1259 1260 auto csv27 = "\"\r\n\"\r\n"; 1261 auto csv28 = "\"\r\n\"\r\n\"\r\n\"\r\n"; 1262 auto csv29 = "\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n"; 1263 1264 /* The Skip 1 expected results. 
*/ 1265 auto tsv1Skip1 = ""; 1266 auto tsv2Skip1 = ""; 1267 1268 auto tsv3Skip1 = ""; 1269 auto tsv4Skip1 = "\n"; 1270 auto tsv5Skip1 = "\n\n"; 1271 1272 auto tsv6Skip1 = ""; 1273 auto tsv7Skip1 = "b\n"; 1274 auto tsv8Skip1 = "b\nc\n"; 1275 1276 auto tsv9Skip1 = ""; 1277 auto tsv10Skip1 = " \n"; 1278 auto tsv11Skip1 = " \n \n"; 1279 1280 auto tsv12Skip1 = ""; 1281 auto tsv13Skip1 = "\n"; 1282 auto tsv14Skip1 = "\n\n"; 1283 1284 auto tsv15Skip1 = ""; 1285 auto tsv16Skip1 = "b\n"; 1286 auto tsv17Skip1 = "b\nc\n"; 1287 1288 auto tsv18Skip1 = ""; 1289 auto tsv19Skip1 = " \n"; 1290 auto tsv20Skip1 = " \n \n"; 1291 1292 auto tsv21Skip1 = ""; 1293 auto tsv22Skip1 = "\n"; 1294 auto tsv23Skip1 = "\n\n"; 1295 1296 auto tsv24Skip1 = ""; 1297 auto tsv25Skip1 = "b\n"; 1298 auto tsv26Skip1 = "b\nc\n"; 1299 1300 auto tsv27Skip1 = ""; 1301 auto tsv28Skip1 = " \n"; 1302 auto tsv29Skip1 = " \n \n"; 1303 1304 /* The Skip 2 expected results. */ 1305 auto tsv1Skip2 = ""; 1306 auto tsv2Skip2 = ""; 1307 1308 auto tsv3Skip2 = ""; 1309 auto tsv4Skip2 = ""; 1310 auto tsv5Skip2 = "\n"; 1311 1312 auto tsv6Skip2 = ""; 1313 auto tsv7Skip2 = ""; 1314 auto tsv8Skip2 = "c\n"; 1315 1316 auto tsv9Skip2 = ""; 1317 auto tsv10Skip2 = ""; 1318 auto tsv11Skip2 = " \n"; 1319 1320 auto tsv12Skip2 = ""; 1321 auto tsv13Skip2 = ""; 1322 auto tsv14Skip2 = "\n"; 1323 1324 auto tsv15Skip2 = ""; 1325 auto tsv16Skip2 = ""; 1326 auto tsv17Skip2 = "c\n"; 1327 1328 auto tsv18Skip2 = ""; 1329 auto tsv19Skip2 = ""; 1330 auto tsv20Skip2 = " \n"; 1331 1332 auto tsv21Skip2 = ""; 1333 auto tsv22Skip2 = ""; 1334 auto tsv23Skip2 = "\n"; 1335 1336 auto tsv24Skip2 = ""; 1337 auto tsv25Skip2 = ""; 1338 auto tsv26Skip2 = "c\n"; 1339 1340 auto tsv27Skip2 = ""; 1341 auto tsv28Skip2 = ""; 1342 auto tsv29Skip2 = " \n"; 1343 1344 auto csvSet = 1345 [csv1, csv2, csv3, csv4, csv5, csv6, csv7, csv8, csv9, csv10, 1346 csv11, csv12, csv13, csv14, csv15, csv16, csv17, csv18, csv19, csv20, 1347 csv21, csv22, csv23, csv24, csv25, csv26, 
csv27, csv28, csv29]; 1348 1349 auto tsvSkip1Set = 1350 [tsv1Skip1, tsv2Skip1, tsv3Skip1, tsv4Skip1, tsv5Skip1, tsv6Skip1, tsv7Skip1, tsv8Skip1, tsv9Skip1, tsv10Skip1, 1351 tsv11Skip1, tsv12Skip1, tsv13Skip1, tsv14Skip1, tsv15Skip1, tsv16Skip1, tsv17Skip1, tsv18Skip1, tsv19Skip1, tsv20Skip1, 1352 tsv21Skip1, tsv22Skip1, tsv23Skip1, tsv24Skip1, tsv25Skip1, tsv26Skip1, tsv27Skip1, tsv28Skip1, tsv29Skip1]; 1353 1354 auto tsvSkip2Set = 1355 [tsv1Skip2, tsv2Skip2, tsv3Skip2, tsv4Skip2, tsv5Skip2, tsv6Skip2, tsv7Skip2, tsv8Skip2, tsv9Skip2, tsv10Skip2, 1356 tsv11Skip2, tsv12Skip2, tsv13Skip2, tsv14Skip2, tsv15Skip2, tsv16Skip2, tsv17Skip2, tsv18Skip2, tsv19Skip2, tsv20Skip2, 1357 tsv21Skip2, tsv22Skip2, tsv23Skip2, tsv24Skip2, tsv25Skip2, tsv26Skip2, tsv27Skip2, tsv28Skip2, tsv29Skip2]; 1358 1359 auto bufferSizeTests = [1, 2, 3, 4, 8, 128]; 1360 1361 foreach (bufferSize; bufferSizeTests) 1362 { 1363 ubyte[] readBuffer = new ubyte[](bufferSize); 1364 1365 foreach (i, csv, tsvSkip1, tsvSkip2; lockstep(csvSet, tsvSkip1Set, tsvSkip2Set)) 1366 { 1367 ubyte[] csvInput = csv.dup.representation; 1368 auto csvToTSVSkip1 = appender!(char[])(); 1369 auto csvToTSVSkip2 = appender!(char[])(); 1370 1371 csv2tsv(csvInput, csvToTSVSkip1, readBuffer, "csvToTSVSkip1", 1); 1372 1373 assert(tsvSkip1 == csvToTSVSkip1.data, 1374 format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1375 i + 1, bufferSize, csv, tsvSkip1, csvToTSVSkip1.data)); 1376 1377 csv2tsv(csvInput, csvToTSVSkip2, readBuffer, "csvToTSVSkip2", 2); 1378 1379 assert(tsvSkip2 == csvToTSVSkip2.data, 1380 format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1381 i + 1, bufferSize, csv, tsvSkip2, csvToTSVSkip2.data)); 1382 } 1383 } 1384 } 1385 1386 // csv2tsv BOM tests. 
// Note: std.range.lockstep prevents use of @safe
unittest
{
    import std.conv : hexString;
    import std.string : representation;

    /* UTF-8 Byte Order Mark: EF BB BF. Prepended to the plain inputs/outputs to
     * build the BOM variants of each test case.
     */
    enum utf8BOM = hexString!"efbbbf";

    /* CSV inputs without a BOM. */
    auto csv1 = "";
    auto csv2 = "a";
    auto csv3 = "ab";
    auto csv4 = "a,b";
    auto csv5 = "a,b\ncdef,ghi\njklmn,opqrs\ntuv,wxyz";

    /* The same inputs with a leading UTF-8 BOM. */
    auto csv1BOM = utf8BOM ~ csv1;
    auto csv2BOM = utf8BOM ~ csv2;
    auto csv3BOM = utf8BOM ~ csv3;
    auto csv4BOM = utf8BOM ~ csv4;
    auto csv5BOM = utf8BOM ~ csv5;

    /* Expected TSV output when the input has no BOM (or the BOM was removed). */
    auto tsv1 = "";
    auto tsv2 = "a\n";
    auto tsv3 = "ab\n";
    auto tsv4 = "a\tb\n";
    auto tsv5 = "a\tb\ncdef\tghi\njklmn\topqrs\ntuv\twxyz\n";

    /* Note: csv1 is the empty string, so tsv1 does not have a trailing newline.
     * However, with the BOM prepended the tsv gets a trailing newline.
     */
    auto tsv1BOM = utf8BOM ~ tsv1 ~ "\n";
    auto tsv2BOM = utf8BOM ~ tsv2;
    auto tsv3BOM = utf8BOM ~ tsv3;
    auto tsv4BOM = utf8BOM ~ tsv4;
    auto tsv5BOM = utf8BOM ~ tsv5;

    auto csvSet = [csv1, csv2, csv3, csv4, csv5];
    auto csvBOMSet = [csv1BOM, csv2BOM, csv3BOM, csv4BOM, csv5BOM];

    auto tsvSet = [tsv1, tsv2, tsv3, tsv4, tsv5];
    auto tsvBOMSet = [tsv1BOM, tsv2BOM, tsv3BOM, tsv4BOM, tsv5BOM];

    /* Buffer sizes 1-2 are smaller than the 3-byte BOM; these exercise the
     * "buffer too small to detect the BOM" path below.
     */
    auto bufferSizeTests = [1, 2, 3, 4, 8, 128];

    foreach (bufferSize; bufferSizeTests)
    {
        ubyte[] readBuffer = new ubyte[](bufferSize);

        foreach (i, csv, csvBOM, tsv, tsvBOM; lockstep(csvSet, csvBOMSet, tsvSet, tsvBOMSet))
        {
            ubyte[] csvInput = csv.dup.representation;
            ubyte[] csvBOMInput = csvBOM.dup.representation;

            auto csvToTSV = appender!(char[])();
            auto csvToTSV_NoBOMRemoval = appender!(char[])();
            auto csvBOMToTSV = appender!(char[])();
            auto csvBOMToTSV_NoBOMRemoval = appender!(char[])();

            /* BOM-free input: result is the same whether BOM removal is on or off. */
            csv2tsv(csvInput, csvToTSV, readBuffer, "csvToTSV", 0, '"', ',', '\t', " ", " ", true);
            assert(tsv == csvToTSV.data,
                   format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsv, csvToTSV.data));

            csv2tsv(csvInput, csvToTSV_NoBOMRemoval, readBuffer, "csvToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
            assert(tsv == csvToTSV_NoBOMRemoval.data,
                   format("Unittest failure. tsv != csvToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsv, csvToTSV_NoBOMRemoval.data));

            /* BOM input with removal enabled. Removal only happens when the read
             * buffer is large enough to hold the full BOM.
             */
            csv2tsv(csvBOMInput, csvBOMToTSV, readBuffer, "csvBOMToTSV", 0, '"', ',', '\t', " ", " ", true);
            if (readBuffer.length < utf8BOM.length)
            {
                /* Removing BOMs, but didn't provide enough buffer, so no removal. */
                assert(tsvBOM == csvBOMToTSV.data,
                       format("Unittest failure. tsvBOM != csvBOMToTSV.data. (Small buffer) Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                              i + 1, bufferSize, csvBOM, tsvBOM, csvBOMToTSV.data));
            }
            else
            {
                assert(tsv == csvBOMToTSV.data,
                       format("Unittest failure. tsv != csvBOMToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                              i + 1, bufferSize, csvBOM, tsv, csvBOMToTSV.data));
            }

            /* BOM input with removal disabled: the BOM passes through untouched. */
            csv2tsv(csvBOMInput, csvBOMToTSV_NoBOMRemoval, readBuffer, "csvBOMToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
            assert(tsvBOM == csvBOMToTSV_NoBOMRemoval.data,
                   format("Unittest failure. tsvBOM != csvBOMToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csvBOM, tsvBOM, csvBOMToTSV_NoBOMRemoval.data));
        }
    }
}