/**
Convert CSV formatted data to TSV format.

This program converts comma-separated value data to tab-separated format.

Copyright (c) 2016-2021, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.csv2tsv;

import std.stdio;
import std.exception : enforce;
import std.format : format;
import std.range;
import std.traits : isArray, Unqual;
import std.typecons : tuple;
import tsv_utils.common.utils : isBufferableInputSource, inputSourceByChunk;

immutable helpText = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details the CSV formats accepted.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.

Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.

Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.

There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.

Behaviors of this program that often vary between CSV implementations:
  * Newlines are supported in quoted fields.
  * Double quotes are permitted in a non-quoted field. However, a field starting
    with a quote must follow quoting rules.
  * Each record can have a different number of fields.
  * The three common forms of newlines are supported: CR, CRLF, LF. Output is
    written using Unix newlines (LF).
  * A newline will be added if the file does not end with one.
  * A UTF-8 Byte Order Mark (BOM) at the start of a file will be removed.
  * No whitespace trimming is done.

This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.

UTF-8 input is assumed. Convert other encodings prior to invoking this tool.

Options:
EOS";

/** Container for command line options.
 *
 * Field defaults are the values used when the corresponding option is not given.
 */
struct Csv2tsvOptions
{
    string programName;
    bool helpVerbose = false;          // --help-verbose
    bool hasHeader = false;            // --H|header
    char csvQuoteChar = '"';           // --q|quote
    char csvDelimChar = ',';           // --c|csv-delim
    char tsvDelimChar = '\t';          // --t|tsv-delim
    string tsvDelimReplacement = " ";  // --r|tab-replacement
    string newlineReplacement = " ";   // --n|newline-replacement
    bool versionWanted = false;        // --V|version

    /** Parses the command line and validates option consistency.
     *
     * Recognized options are removed from cmdArgs by getopt. Help and version
     * requests are printed here.
     *
     * Returns: A (bool, int) tuple. The bool is true if processing should continue.
     * When it is false, the int is the exit code the program should return: zero
     * after printing help or version text, one after an argument error (the error
     * is written to stderr).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote", "CHR Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim", "CHR Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim", "CHR Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|tab-replacement", "STR Replacement for TSV field delimiters (typically TABs) found in CSV input. Default: Space.", &tsvDelimReplacement,
                "n|newline-replacement", "STR Replacement for newlines found in CSV input. Default: Space.", &newlineReplacement,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Consistency checks. A failed check throws; the catch below reports it. */
            enforce(csvQuoteChar != '\n' && csvQuoteChar != '\r',
                    "CSV quote character cannot be newline (--q|quote).");

            enforce(csvQuoteChar != csvDelimChar,
                    "CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");

            enforce(csvQuoteChar != tsvDelimChar,
                    "CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");

            enforce(csvDelimChar != '\n' && csvDelimChar != '\r',
                    "CSV field delimiter cannot be newline (--c|csv-delim).");

            enforce(tsvDelimChar != '\n' && tsvDelimChar != '\r',
                    "TSV field delimiter cannot be newline (--t|tsv-delim).");

            enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(tsvDelimReplacement),
                    "Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement).");

            enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(newlineReplacement),
                    "Replacement character cannot contain newlines or TSV field delimiters (--n|newline-replacement).");
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/* D runtime option string, set for compiler front-end versions 2085 and later. */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // No main in unittest
}
else
{
    /** Program entry point. Processes the command line, then converts the input
     * files (the arguments remaining after option processing).
     *
     * Returns: Zero on success, one if an exception is thrown during conversion,
     * otherwise the exit code produced by Csv2tsvOptions.processArgs.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try csv2tsvFiles(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            /* Write a newline and flush stdout before reporting the error on stderr. */
            writeln();
            stdout.flush();
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}

/** csv2tsvFiles takes a list of input files and passes each to csv2tsv, which
 * runs on a single file. csv2tsvFiles manages header lines and sets up the
 * BufferedOutputRange passed to csv2tsv.
 *
 * An empty file list means standard input is read. The file name "-" also
 * denotes standard input.
 */
void csv2tsvFiles(const ref Csv2tsvOptions cmdopt, const string[] inputFiles)
{
    import tsv_utils.common.utils : BufferedOutputRange;

    /* Buffer Sizes
     *
     * ReadBufferSize is the typical size used for buffered reads by most tsv-utils
     * programs. Nothing unusual there. However, the default sizes used by
     * BufferedOutputRange are overridden to allocate a larger initial buffer (the
     * reserve size) and to ensure buffers are flushed to standard output more
     * quickly (the max size).
     *
     * BufferedOutputRange is intended primarily for record oriented writes, where
     * output ends in newlines. When given a string ending in a newline, the buffer
     * is flushed if it is greater than 'flush size'. Otherwise buffers are flushed
     * after exceeding 'max size'.
     *
     * For csv2tsv's buffered conversion algorithm there are two very different cases:
     * 1) Extensive use of CSV escapes, where all fields are quoted.
     * 2) Limited use of CSV escapes, where few fields are quoted.
     *
     * The first case will translate to record oriented writes. In particular, if the
     * first field is quoted, the write to BufferedOutputRange will be on a newline
     * boundary. (A quoted field pushes accumulated data to BufferedOutputRange.) For
     * this case, the default flush behavior of BufferedOutputRange works well.
     *
     * In the second case, data gets pushed to BufferedOutputRange on arbitrary byte
     * boundaries. BufferedOutputRange won't flush to standard output until max size
     * bytes have been accumulated. The default max size is larger than optimal, so
     * instead max size is set to a size similar to the read buffer size. Reserve
     * is increased for the same reason.
     */
    enum ReadBufferSize = 1024L * 128L;
    enum OutputBufferFlushSize = 1024L * 10L;
    enum OutputBufferReserveSize = 1024L * 129L;
    enum OutputBufferMaxSize = 1024L * 128L;

    ubyte[ReadBufferSize] fileRawBuf;
    auto stdoutWriter = BufferedOutputRange!(typeof(stdout))(
        stdout, OutputBufferFlushSize, OutputBufferReserveSize, OutputBufferMaxSize);
    bool firstFile = true;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File;
        auto printFileName = (filename == "-") ? "stdin" : filename;

        /* With --header, the header line of every file after the first is skipped. */
        auto skipLines = (firstFile || !cmdopt.hasHeader) ? 0 : 1;

        csv2tsv(inputStream, stdoutWriter, fileRawBuf, printFileName, skipLines,
                cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                cmdopt.newlineReplacement);

        firstFile = false;
    }
}

/* csv2tsv buffered conversion algorithm

This version of csv2tsv uses a buffered approach to csv-to-tsv conversion. This is a
change from the original version, which used a character-at-a-time approach, with
characters coming from an infinite stream of characters. The character-at-a-time
approach was nice from a simplicity perspective, but the approach didn't optimize well.
Note that the original version read input in blocks and wrote to stdout in blocks, it
was the conversion algorithm itself that was character oriented.

The idea is to convert a buffer at a time, writing larger blocks to the output stream
rather than one character at a time. In addition, the read buffer is modified in-place
when the only change is to convert a single character. The notable case is converting
the field delimiter character, typically comma to TAB. The result is writing longer
blocks to the output stream (BufferedOutputRange).

Performance improvements from the new algorithm are notable. This is especially true
versus the previous version 2.0.0. Note though that the more recent versions of
csv2tsv were slower due to degradations coming from compiler and/or language version.
Version 1.1.19 was quite a bit faster. Regardless of version, the performance
improvement is especially good when run against "simple" CSV files, with limited
amounts of CSV escape syntax. In these files the main change is converting the field
delimiter character, typically comma to TAB.

In some benchmarks on Mac OS, the new version was 40% faster than csv2tsv 2.0.0 on
files with significant CSV escapes, and 60% faster on files with limited CSV escapes.
Versus csv2tsv version 1.1.19, the new version is 10% and 40% faster on the same
files. On the "simple CSV" file, where Unix 'tr' is an option, 'tr' was still faster,
by about 20%. But getting into the 'tr' ballpark while retaining safety of correct
csv2tsv conversion is a good result.

Algorithm notes:

The algorithm works by reading an input block, then examining each byte in-order to
identify needed modifications. The region of consecutive characters without a change
is tracked. Single character changes are done in-place, in the read buffer. This
allows assembling longer blocks before write is needed. The region being tracked is
written to the output stream when it can no longer be extended in a continuous
fashion. At this point a new region is started. When the current read buffer has
been processed the current region is written out and a new block of data read in.

The read buffer uses fixed size blocks. This means the algorithm is actually
operating on bytes (UTF-8 code units), and not characters. This works because all
delimiters and CSV escape syntax characters are single byte UTF-8 characters. These
are the only characters requiring interpretation. The main nuisance is the 2-byte
CRLF newline sequence, as this might be split across two read buffers. This is
handled by embedding 'CR' states in the finite state machine.

Processing CSV escapes will often cause character removals and additions. These
will not be representable in a continuous stream of bytes without moving bytes around.
Instead of moving bytes, these cases are handled by immediately writing to the output
stream. This allows restarting a new block of contiguous characters. Handling by the
new algorithm is described below. Note that the length of the replacement characters
for TSV field and record delimiters (e.g. TAB, newline) affects the processing.

All replacement character lengths:

* Windows newline (CRLF) at the end of a line - Replace the CRLF with LF.

  Replace the CR with LF, add it to the current write region and terminate it. The
  next write region starts at the character after the LF.

* Double quote starting or ending a field - Drop the double quote.

  Terminate the current write region, next write region starts at the next character.

* Double quote pair inside a quoted field - Drop one of the double quotes.

  The algorithm drops the first double quote and keeps the second. This avoids
  look-ahead and both field terminating double quote and double quote pair can be
  handled the same way. Terminate the current write region without adding the double
  quote. The next write region starts at the next character.

Single byte replacement characters:

* Windows newline (CRLF) in a quoted field

  Replace the CR with the replacement char, add it to the current write region and
  terminate it. The next write region starts at the character after the LF.

Multi-byte replacement sequences:

* TSV Delimiter (TAB by default) in a field

  Terminate the current write region, write it out and the replacement. The next
  write region starts at the next character.

* LF, CR, or CRLF in a quoted field

  Terminate the current write region, write it and the replacement. The next write
  region starts at the next character.

csv2tsv API

At the API level, it is desirable to handle both open files and input streams.
Open files are the key requirement, but handling input streams simplifies unit
testing, and in-memory conversion is likely to be useful anyway. Internally, it
should be easy enough to encapsulate the differences between input streams and files.
Reading files can be done using File.byChunk and reading from input streams can be
done using std.range.chunks.

This has been handled by creating a new range that can iterate either files or
input streams chunk-by-chunk.
*/

/** Read CSV from an input source, convert to TSV and write to an output source.
 *
 * Params:
 *   inputSource           =  A "bufferable" input source, either a file open for
 *                            read or an input range with ubyte elements.
 *   outputStream          =  An output range to write TSV bytes to.
 *   readBuffer            =  A buffer to use for reading. Must have a non-zero length.
 *                            Its contents are modified in-place during conversion.
 *   filename              =  Name of file to use when reporting errors. A descriptive
 *                            name can be used in lieu of a file name.
 *   skipLines             =  Number of lines to skip before outputting records.
 *                            Typically used to skip writing header lines.
 *   csvQuote              =  The quoting character used in the CSV input.
 *   csvDelim              =  The field delimiter character used in the CSV input.
 *   tsvDelim              =  The field delimiter character to use in the TSV output.
 *   tsvDelimReplacement   =  String to use when replacing TSV field delimiters
 *                            (e.g. TABs) found in the CSV data fields.
 *   tsvNewlineReplacement =  String to use when replacing newlines found in the CSV
 *                            data fields.
 *   discardBOM            =  If true (the default), a UTF-8 Byte Order Mark found at the
 *                            start of the input stream will be dropped.
 *
 * Throws: Exception on finding inconsistent CSV. Exception text includes the filename and
 * line number where the error was identified. This includes input that ends while still
 * inside a quoted field, whether or not the last byte read was a CR.
 */
void csv2tsv(InputSource, OutputRange)(
    InputSource inputSource,
    auto ref OutputRange outputStream,
    ubyte[] readBuffer,
    string filename = "(none)",
    size_t skipLines = 0,
    const char csvQuote = '"',
    const char csvDelim = ',',
    const char tsvDelim = '\t',
    const string tsvDelimReplacement = " ",
    const string tsvNewlineReplacement = " ",
    bool discardBOM = true,
)
if (isBufferableInputSource!InputSource &&
    isOutputRange!(OutputRange, char))
{
    import std.conv: hexString;

    assert (readBuffer.length >= 1);

    enum char LF = '\n';
    enum char CR = '\r';

    enum ubyte[3] UTF8_BOM = cast(ubyte[3])hexString!"efbbbf";

    /* Process state information - These variables are defined either in the outer
     * context or within one of the foreach loops.
     *
     *   * recordNum - The current CSV input line/record number. Starts at one.
     *   * fieldNum - Field number in the current line/record. Field numbers are
     *     one-upped. The field number is set to zero at the start of a new record,
     *     prior to processing the first character of the first field on the record.
     *   * csvState - The current state of CSV processing. In particular, the state
     *     of the finite state machine.
     *   * writeRegionStart - Read buffer index where the next write starts from.
     *   * nextIndex - The index of the current input ubyte being processed. The
     *     current write region extends from the writeRegionStart to nextIndex.
     *   * nextChar - The current input ubyte. The ubyte/char at nextIndex.
     */

    enum CSVState
    {
        FieldEnd,           // Start of input or after consuming a field or record delimiter.
        NonQuotedField,     // Processing a non-quoted field
        QuotedField,        // Processing a quoted field
        QuoteInQuotedField, // Last char was a quote in a quoted field
        CRAtFieldEnd,       // Last char was a CR terminating a record/line
        CRInQuotedField,    // Last char was a CR in a quoted field
    }

    CSVState csvState = CSVState.FieldEnd;
    size_t recordNum = 1;
    size_t fieldNum = 0;

    foreach (chunkIndex, inputChunkComplete; inputSource.inputSourceByChunk(readBuffer).enumerate)
    {
        size_t writeRegionStart = 0;

        /* Discard byte order marks at the start of input.
         * Note: Slicing the chunk in this fashion generates very good code, better
         * than other approaches like manipulating indices.
         */
        auto inputChunk =
            (discardBOM &&
             chunkIndex == 0 &&
             inputChunkComplete.length >= UTF8_BOM.length &&
             inputChunkComplete[0 .. UTF8_BOM.length] == UTF8_BOM
            )
            ? inputChunkComplete[UTF8_BOM.length .. $]
            : inputChunkComplete[];

        /* flushCurrentRegion flushes the current write region and moves the start of
         * the next write region one byte past the end of the current region. If
         * appendChars are provided they are output as well.
         *
         * This routine is called when the current character (byte) terminates the
         * current write region and should not itself be output. That is why the next
         * write region always starts one byte past the current region end.
         *
         * This routine is also called when the 'skiplines' region has been processed.
         * This is done to flush the region without actually writing it. This is done
         * by the 'nextRecord' routine defined in the foreach loop.
         */
        void flushCurrentRegion(size_t regionEnd, const char[] appendChars = "")
        {
            assert(regionEnd <= inputChunk.length);

            if (recordNum > skipLines)
            {
                if (regionEnd > writeRegionStart)
                {
                    outputStream.put(inputChunk[writeRegionStart .. regionEnd]);
                }
                if (appendChars.length > 0)
                {
                    outputStream.put(appendChars);
                }
            }

            writeRegionStart = regionEnd + 1;
        }

        foreach (size_t nextIndex, char nextChar; inputChunk)
        {
            /* nextRecord is used when an end of record (end of line) is found. It
             * bumps the record number and resets the field number. It also flushes
             * the current write region if the line we were on was the last line
             * being skipped at the start of input. Normally the header line.
             */
            void nextRecord()
            {
                if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                ++recordNum;
                fieldNum = 0;
            }

            OuterSwitch: final switch (csvState)
            {
            case CSVState.FieldEnd:
                /* Start of input or after consuming a field terminator. */
                ++fieldNum;

                /* Note: Can't use switch due to the 'goto case' to the OuterSwitch. */
                if (nextChar == csvQuote)
                {
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                }
                else
                {
                    /* Processing state change only. Don't consume the character. */
                    csvState = CSVState.NonQuotedField;
                    goto case CSVState.NonQuotedField;
                }

            case CSVState.NonQuotedField:
                switch (nextChar)
                {
                default:
                    break OuterSwitch;
                case csvDelim:
                    inputChunk[nextIndex] = tsvDelim;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case LF:
                    nextRecord();
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case CR:
                    inputChunk[nextIndex] = LF;
                    nextRecord();
                    csvState = CSVState.CRAtFieldEnd;
                    break OuterSwitch;
                case tsvDelim:
                    if (tsvDelimReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvDelimReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvDelimReplacement);
                    }
                    break OuterSwitch;
                }

            case CSVState.QuotedField:
                switch (nextChar)
                {
                default:
                    break OuterSwitch;
                case csvQuote:
                    /*
                     * Flush the current region, without the double quote. Switch state
                     * to QuoteInQuotedField, which determines whether to output a quote.
                     */
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuoteInQuotedField;
                    break OuterSwitch;

                case tsvDelim:
                    if (tsvDelimReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvDelimReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvDelimReplacement);
                    }
                    break OuterSwitch;
                case LF:
                    /* Newline in a quoted field. */
                    if (tsvNewlineReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvNewlineReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvNewlineReplacement);
                    }
                    break OuterSwitch;
                case CR:
                    /* Carriage Return in a quoted field. */
                    if (tsvNewlineReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvNewlineReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvNewlineReplacement);
                    }
                    csvState = CSVState.CRInQuotedField;
                    break OuterSwitch;
                }

            case CSVState.QuoteInQuotedField:
                /* Just processed a quote in a quoted field. The buffer, without the
                 * quote, was just flushed. Only legal characters here are quote,
                 * comma (field delimiter), newline (record delimiter).
                 */
                switch (nextChar)
                {
                case csvQuote:
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                case csvDelim:
                    inputChunk[nextIndex] = tsvDelim;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case LF:
                    nextRecord();
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case CR:
                    inputChunk[nextIndex] = LF;
                    nextRecord();
                    csvState = CSVState.CRAtFieldEnd;
                    break OuterSwitch;
                default:
                    throw new Exception(
                        format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                               (filename == "-") ? "Standard Input" : filename,
                               recordNum));
                }

            case CSVState.CRInQuotedField:
                if (nextChar == LF)
                {
                    /* CRLF pair: the CR was already replaced, so drop the LF. */
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                }
                else {
                    /* Naked CR. State change only, don't consume current character. */
                    csvState = CSVState.QuotedField;
                    goto case CSVState.QuotedField;
                }

            case CSVState.CRAtFieldEnd:
                if (nextChar == LF)
                {
                    /* CRLF pair: the CR was already converted to LF, so drop the LF. */
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                }
                else {
                    /* Naked CR. State change only, don't consume current character. */
                    csvState = CSVState.FieldEnd;
                    goto case CSVState.FieldEnd;
                }
            }
        }

        /* End of buffer. */
        if (writeRegionStart < inputChunk.length && recordNum > skipLines)
        {
            outputStream.put(inputChunk[writeRegionStart .. $]);
        }

        writeRegionStart = 0;
    }

    /* Input must not end inside a quoted field. CRInQuotedField is the quoted-field
     * state entered after a CR, so it is checked as well. Otherwise input such as
     * `"abc\r` would be accepted while `"abc\n` is rejected.
     */
    enforce(csvState != CSVState.QuotedField && csvState != CSVState.CRInQuotedField,
            format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                   (filename == "-") ? "Standard Input" : filename,
                   recordNum));

    /* Output a newline if the CSV input did not have a terminating newline. */
    if (fieldNum > 0 && recordNum > skipLines) put(outputStream, '\n');
}

unittest
{
    /* Unit tests for the csv2tsv function.
     *
     * These unit tests exercise different CSV combinations and escaping cases. The CSV
     * data content is the same for each corresponding test string, except the delimiters
     * have been changed. e.g csv6a and csv6b have the same data content.
     *
     * A property used in these tests is that changing the CSV delimiters doesn't change
     * the resulting TSV. However, changing the TSV delimiters will change the TSV result,
     * as TSV doesn't support having its delimiters in the data. This allows having a
     * single TSV expected set that is generated by CSVs with different delimiter sets.
     *
     * This test set does not test main, file handling, or error messages. These are
     * handled by tests run against the executable.
     *
     * Note: unittest is non @safe due to the casts from string to ubyte[]. This can
     * probably be rewritten to use std.string.representation instead, which is @safe.
     */

    /* Default CSV.
*/ 701 auto csv1a = "a,b,c"; 702 auto csv2a = "a,bc,,,def"; 703 auto csv3a = ",a, b , cd ,"; 704 auto csv4a = "ß,ßÀß,あめりか物語,书名: 五色石"; 705 auto csv5a = "\"\n\",\"\n\n\",\"\n\n\n\""; 706 auto csv6a = "\"\t\",\"\t\t\",\"\t\t\t\""; 707 auto csv7a = "\",\",\",,\",\",,,\""; 708 auto csv8a = "\"\",\"\"\"\",\"\"\"\"\"\""; 709 auto csv9a = "\"ab, de\tfg\"\"\nhij\""; 710 auto csv10a = ""; 711 auto csv11a = ","; 712 auto csv12a = ",,"; 713 auto csv13a = "\"\r\",\"\r\r\",\"\r\r\r\""; 714 auto csv14a = "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\""; 715 auto csv15a = "\"ab, de\tfg\"\"\rhij\""; 716 auto csv16a = "\"ab, de\tfg\"\"\r\nhij\""; 717 auto csv17a = "ab\",ab\"cd"; 718 auto csv18a = "\n\n\n"; 719 auto csv19a = "\t"; 720 auto csv20a = "\t\t"; 721 auto csv21a = "a\n"; 722 auto csv22a = "a,\n"; 723 auto csv23a = "a,b\n"; 724 auto csv24a = ",\n"; 725 auto csv25a = "#"; 726 auto csv26a = "^"; 727 auto csv27a = "#^#"; 728 auto csv28a = "^#^"; 729 auto csv29a = "$"; 730 auto csv30a = "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n"; 731 auto csv31a = "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n"; 732 auto csv32a = ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\""; 733 734 // Newlines terminating a line ending a non-quoted field 735 auto csv33a = "\rX\r\nX\n\r\nX\r\n"; 736 737 // Newlines inside a quoted field and terminating a line following a quoted field 738 auto csv34a = "\"\r\",\"X\r\",\"X\rY\",\"\rY\"\r\"\r\n\",\"X\r\n\",\"X\r\nY\",\"\r\nY\"\r\n\"\n\",\"X\n\",\"X\nY\",\"\nY\"\n"; 739 740 // CR at field end 741 auto csv35a = "abc,def\r\"ghi\",\"jkl\"\r\"mno\",pqr\r"; 742 743 /* Set B has the same data and TSV results as set A, but uses # for quote and ^ for comma. 
*/ 744 auto csv1b = "a^b^c"; 745 auto csv2b = "a^bc^^^def"; 746 auto csv3b = "^a^ b ^ cd ^"; 747 auto csv4b = "ß^ßÀß^あめりか物語^书名: 五色石"; 748 auto csv5b = "#\n#^#\n\n#^#\n\n\n#"; 749 auto csv6b = "#\t#^#\t\t#^#\t\t\t#"; 750 auto csv7b = "#,#^#,,#^#,,,#"; 751 auto csv8b = "##^#\"#^#\"\"#"; 752 auto csv9b = "#ab, de\tfg\"\nhij#"; 753 auto csv10b = ""; 754 auto csv11b = "^"; 755 auto csv12b = "^^"; 756 auto csv13b = "#\r#^#\r\r#^#\r\r\r#"; 757 auto csv14b = "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#"; 758 auto csv15b = "#ab, de\tfg\"\rhij#"; 759 auto csv16b = "#ab, de\tfg\"\r\nhij#"; 760 auto csv17b = "ab\"^ab\"cd"; 761 auto csv18b = "\n\n\n"; 762 auto csv19b = "\t"; 763 auto csv20b = "\t\t"; 764 auto csv21b = "a\n"; 765 auto csv22b = "a^\n"; 766 auto csv23b = "a^b\n"; 767 auto csv24b = "^\n"; 768 auto csv25b = "####"; 769 auto csv26b = "#^#"; 770 auto csv27b = "###^###"; 771 auto csv28b = "#^##^#"; 772 auto csv29b = "$"; 773 auto csv30b = "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n"; 774 auto csv31b = "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n"; 775 auto csv32b = "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#"; 776 auto csv33b = "\rX\r\nX\n\r\nX\r\n"; 777 auto csv34b = "#\r#^#X\r#^#X\rY#^#\rY#\r#\r\n#^#X\r\n#^#X\r\nY#^#\r\nY#\r\n#\n#^#X\n#^#X\nY#^#\nY#\n"; 778 auto csv35b = "abc^def\r#ghi#^#jkl#\r#mno#^pqr\r"; 779 780 /* The expected results for csv sets A and B. 
This is for the default TSV delimiters.*/ 781 auto tsv1 = "a\tb\tc\n"; 782 auto tsv2 = "a\tbc\t\t\tdef\n"; 783 auto tsv3 = "\ta\t b \t cd \t\n"; 784 auto tsv4 = "ß\tßÀß\tあめりか物語\t书名: 五色石\n"; 785 auto tsv5 = " \t \t \n"; 786 auto tsv6 = " \t \t \n"; 787 auto tsv7 = ",\t,,\t,,,\n"; 788 auto tsv8 = "\t\"\t\"\"\n"; 789 auto tsv9 = "ab, de fg\" hij\n"; 790 auto tsv10 = ""; 791 auto tsv11 = "\t\n"; 792 auto tsv12 = "\t\t\n"; 793 auto tsv13 = " \t \t \n"; 794 auto tsv14 = " \t \t \n"; 795 auto tsv15 = "ab, de fg\" hij\n"; 796 auto tsv16 = "ab, de fg\" hij\n"; 797 auto tsv17 = "ab\"\tab\"cd\n"; 798 auto tsv18 = "\n\n\n"; 799 auto tsv19 = " \n"; 800 auto tsv20 = " \n"; 801 auto tsv21 = "a\n"; 802 auto tsv22 = "a\t\n"; 803 auto tsv23 = "a\tb\n"; 804 auto tsv24 = "\t\n"; 805 auto tsv25 = "#\n"; 806 auto tsv26 = "^\n"; 807 auto tsv27 = "#^#\n"; 808 auto tsv28 = "^#^\n"; 809 auto tsv29 = "$\n"; 810 auto tsv30 = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n"; 811 auto tsv31 = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n"; 812 auto tsv32 = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n"; 813 auto tsv33 = "\nX\nX\n\nX\n"; 814 auto tsv34 = " \tX \tX Y\t Y\n \tX \tX Y\t Y\n \tX \tX Y\t Y\n"; 815 auto tsv35 = "abc\tdef\nghi\tjkl\nmno\tpqr\n"; 816 817 /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab. 818 * This will also result in different replacements when TAB and $ appear in the CSV. 
819 */ 820 auto tsv1_x = "a$b$c\n"; 821 auto tsv2_x = "a$bc$$$def\n"; 822 auto tsv3_x = "$a$ b $ cd $\n"; 823 auto tsv4_x = "ß$ßÀß$あめりか物語$书名: 五色石\n"; 824 auto tsv5_x = " $ $ \n"; 825 auto tsv6_x = "\t$\t\t$\t\t\t\n"; 826 auto tsv7_x = ",$,,$,,,\n"; 827 auto tsv8_x = "$\"$\"\"\n"; 828 auto tsv9_x = "ab, de\tfg\" hij\n"; 829 auto tsv10_x = ""; 830 auto tsv11_x = "$\n"; 831 auto tsv12_x = "$$\n"; 832 auto tsv13_x = " $ $ \n"; 833 auto tsv14_x = " $ $ \n"; 834 auto tsv15_x = "ab, de\tfg\" hij\n"; 835 auto tsv16_x = "ab, de\tfg\" hij\n"; 836 auto tsv17_x = "ab\"$ab\"cd\n"; 837 auto tsv18_x = "\n\n\n"; 838 auto tsv19_x = "\t\n"; 839 auto tsv20_x = "\t\t\n"; 840 auto tsv21_x = "a\n"; 841 auto tsv22_x = "a$\n"; 842 auto tsv23_x = "a$b\n"; 843 auto tsv24_x = "$\n"; 844 auto tsv25_x = "#\n"; 845 auto tsv26_x = "^\n"; 846 auto tsv27_x = "#^#\n"; 847 auto tsv28_x = "^#^\n"; 848 auto tsv29_x = " \n"; 849 auto tsv30_x = " $ \n $ $ \n^# $ #^$# ^$^ #\n"; 850 auto tsv31_x = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n"; 851 auto tsv32_x = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n"; 852 auto tsv33_x = "\nX\nX\n\nX\n"; 853 auto tsv34_x = " $X $X Y$ Y\n $X $X Y$ Y\n $X $X Y$ Y\n"; 854 auto tsv35_x = "abc$def\nghi$jkl\nmno$pqr\n"; 855 856 /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab, 857 * and with the delimiter/newline replacement string being |--|. Basically, newlines 858 * and '$' in the original data are replaced by |--|. 
859 */ 860 auto tsv1_y = "a$b$c\n"; 861 auto tsv2_y = "a$bc$$$def\n"; 862 auto tsv3_y = "$a$ b $ cd $\n"; 863 auto tsv4_y = "ß$ßÀß$あめりか物語$书名: 五色石\n"; 864 auto tsv5_y = "|--|$|--||--|$|--||--||--|\n"; 865 auto tsv6_y = "\t$\t\t$\t\t\t\n"; 866 auto tsv7_y = ",$,,$,,,\n"; 867 auto tsv8_y = "$\"$\"\"\n"; 868 auto tsv9_y = "ab, de\tfg\"|--|hij\n"; 869 auto tsv10_y = ""; 870 auto tsv11_y = "$\n"; 871 auto tsv12_y = "$$\n"; 872 auto tsv13_y = "|--|$|--||--|$|--||--||--|\n"; 873 auto tsv14_y = "|--|$|--||--|$|--||--||--|\n"; 874 auto tsv15_y = "ab, de\tfg\"|--|hij\n"; 875 auto tsv16_y = "ab, de\tfg\"|--|hij\n"; 876 auto tsv17_y = "ab\"$ab\"cd\n"; 877 auto tsv18_y = "\n\n\n"; 878 auto tsv19_y = "\t\n"; 879 auto tsv20_y = "\t\t\n"; 880 auto tsv21_y = "a\n"; 881 auto tsv22_y = "a$\n"; 882 auto tsv23_y = "a$b\n"; 883 auto tsv24_y = "$\n"; 884 auto tsv25_y = "#\n"; 885 auto tsv26_y = "^\n"; 886 auto tsv27_y = "#^#\n"; 887 auto tsv28_y = "^#^\n"; 888 auto tsv29_y = "|--|\n"; 889 auto tsv30_y = "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n"; 890 auto tsv31_y = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n"; 891 auto tsv32_y = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n"; 892 auto tsv33_y = "\nX\nX\n\nX\n"; 893 auto tsv34_y = "|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n"; 894 auto tsv35_y = "abc$def\nghi$jkl\nmno$pqr\n"; 895 896 /* The TSV results for CSV sets 1a and 1b, but with the TAB replacement as |TAB| 897 * and newline replacement |NL|. 
898 */ 899 auto tsv1_z = "a\tb\tc\n"; 900 auto tsv2_z = "a\tbc\t\t\tdef\n"; 901 auto tsv3_z = "\ta\t b \t cd \t\n"; 902 auto tsv4_z = "ß\tßÀß\tあめりか物語\t书名: 五色石\n"; 903 auto tsv5_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n"; 904 auto tsv6_z = "<TAB>\t<TAB><TAB>\t<TAB><TAB><TAB>\n"; 905 auto tsv7_z = ",\t,,\t,,,\n"; 906 auto tsv8_z = "\t\"\t\"\"\n"; 907 auto tsv9_z = "ab, de<TAB>fg\"<NL>hij\n"; 908 auto tsv10_z = ""; 909 auto tsv11_z = "\t\n"; 910 auto tsv12_z = "\t\t\n"; 911 auto tsv13_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n"; 912 auto tsv14_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n"; 913 auto tsv15_z = "ab, de<TAB>fg\"<NL>hij\n"; 914 auto tsv16_z = "ab, de<TAB>fg\"<NL>hij\n"; 915 auto tsv17_z = "ab\"\tab\"cd\n"; 916 auto tsv18_z = "\n\n\n"; 917 auto tsv19_z = "<TAB>\n"; 918 auto tsv20_z = "<TAB><TAB>\n"; 919 auto tsv21_z = "a\n"; 920 auto tsv22_z = "a\t\n"; 921 auto tsv23_z = "a\tb\n"; 922 auto tsv24_z = "\t\n"; 923 auto tsv25_z = "#\n"; 924 auto tsv26_z = "^\n"; 925 auto tsv27_z = "#^#\n"; 926 auto tsv28_z = "^#^\n"; 927 auto tsv29_z = "$\n"; 928 auto tsv30_z = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n"; 929 auto tsv31_z = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n"; 930 auto tsv32_z = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n"; 931 auto tsv33_z = "\nX\nX\n\nX\n"; 932 auto tsv34_z = "<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n"; 933 auto tsv35_z = "abc\tdef\nghi\tjkl\nmno\tpqr\n"; 934 935 /* Aggregate the test data into parallel arrays. 
*/ 936 auto csvSet1a = [csv1a, csv2a, csv3a, csv4a, csv5a, csv6a, csv7a, csv8a, csv9a, csv10a, 937 csv11a, csv12a, csv13a, csv14a, csv15a, csv16a, csv17a, csv18a, csv19a, csv20a, 938 csv21a, csv22a, csv23a, csv24a, csv25a, csv26a, csv27a, csv28a, csv29a, csv30a, 939 csv31a, csv32a, csv33a, csv34a, csv35a]; 940 941 auto csvSet1b = [csv1b, csv2b, csv3b, csv4b, csv5b, csv6b, csv7b, csv8b, csv9b, csv10b, 942 csv11b, csv12b, csv13b, csv14b, csv15b, csv16b, csv17b, csv18b, csv19b, csv20b, 943 csv21b, csv22b, csv23b, csv24b, csv25b, csv26b, csv27b, csv28b, csv29b, csv30b, 944 csv31b, csv32b, csv33b, csv34b, csv35b]; 945 946 auto tsvSet1 = [tsv1, tsv2, tsv3, tsv4, tsv5, tsv6, tsv7, tsv8, tsv9, tsv10, 947 tsv11, tsv12, tsv13, tsv14, tsv15, tsv16, tsv17, tsv18, tsv19, tsv20, 948 tsv21, tsv22, tsv23, tsv24, tsv25, tsv26, tsv27, tsv28, tsv29, tsv30, 949 tsv31, tsv32, tsv33, tsv34, tsv35]; 950 951 auto tsvSet1_x = [tsv1_x, tsv2_x, tsv3_x, tsv4_x, tsv5_x, tsv6_x, tsv7_x, tsv8_x, tsv9_x, tsv10_x, 952 tsv11_x, tsv12_x, tsv13_x, tsv14_x, tsv15_x, tsv16_x, tsv17_x, tsv18_x, tsv19_x, tsv20_x, 953 tsv21_x, tsv22_x, tsv23_x, tsv24_x, tsv25_x, tsv26_x, tsv27_x, tsv28_x, tsv29_x, tsv30_x, 954 tsv31_x, tsv32_x, tsv33_x, tsv34_x, tsv35_x]; 955 956 auto tsvSet1_y = [tsv1_y, tsv2_y, tsv3_y, tsv4_y, tsv5_y, tsv6_y, tsv7_y, tsv8_y, tsv9_y, tsv10_y, 957 tsv11_y, tsv12_y, tsv13_y, tsv14_y, tsv15_y, tsv16_y, tsv17_y, tsv18_y, tsv19_y, tsv20_y, 958 tsv21_y, tsv22_y, tsv23_y, tsv24_y, tsv25_y, tsv26_y, tsv27_y, tsv28_y, tsv29_y, tsv30_y, 959 tsv31_y, tsv32_y, tsv33_y, tsv34_y, tsv35_y]; 960 961 auto tsvSet1_z = [tsv1_z, tsv2_z, tsv3_z, tsv4_z, tsv5_z, tsv6_z, tsv7_z, tsv8_z, tsv9_z, tsv10_z, 962 tsv11_z, tsv12_z, tsv13_z, tsv14_z, tsv15_z, tsv16_z, tsv17_z, tsv18_z, tsv19_z, tsv20_z, 963 tsv21_z, tsv22_z, tsv23_z, tsv24_z, tsv25_z, tsv26_z, tsv27_z, tsv28_z, tsv29_z, tsv30_z, 964 tsv31_z, tsv32_z, tsv33_z, tsv34_z, tsv35_z]; 965 966 /* The tests. 
*/ 967 auto bufferSizeTests = [1, 2, 3, 8, 128]; 968 969 foreach (bufferSize; bufferSizeTests) 970 { 971 ubyte[] readBuffer = new ubyte[](bufferSize); 972 973 foreach (i, csva, csvb, tsv, tsv_x, tsv_y, tsv_z; lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y, tsvSet1_z)) 974 { 975 import std.conv : to; 976 977 /* Byte streams for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */ 978 ubyte[] csvInputA = cast(ubyte[])csva; 979 ubyte[] csvInputB = cast(ubyte[])csvb; 980 981 /* CSV Set A vs TSV expected. */ 982 auto tsvResultA = appender!(char[])(); 983 csv2tsv(csvInputA, tsvResultA, readBuffer, "csvInputA_defaultTSV"); 984 assert(tsv == tsvResultA.data, 985 format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 986 i + 1, csva, tsv, tsvResultA.data)); 987 988 /* CSV Set B vs TSV expected. Different CSV delimiters, same TSV results as CSV Set A.*/ 989 auto tsvResultB = appender!(char[])(); 990 csv2tsv(csvInputB, tsvResultB, readBuffer, "csvInputB_defaultTSV", 0, '#', '^'); 991 assert(tsv == tsvResultB.data, 992 format("Unittest failure. tsv != tsvResultB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 993 i + 1, csvb, tsv, tsvResultB.data)); 994 995 /* CSV Set A and TSV with $ separator.*/ 996 csvInputA = cast(ubyte[])csva; 997 auto tsvResult_XA = appender!(char[])(); 998 csv2tsv(csvInputA, tsvResult_XA, readBuffer, "csvInputA_TSV_WithDollarDelimiter", 0, '"', ',', '$'); 999 assert(tsv_x == tsvResult_XA.data, 1000 format("Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1001 i + 1, csva, tsv_x, tsvResult_XA.data)); 1002 1003 /* CSV Set B and TSV with $ separator. Same TSV results as CSV Set A.*/ 1004 csvInputB = cast(ubyte[])csvb; 1005 auto tsvResult_XB = appender!(char[])(); 1006 csv2tsv(csvInputB, tsvResult_XB, readBuffer, "csvInputB__TSV_WithDollarDelimiter", 0, '#', '^', '$'); 1007 assert(tsv_x == tsvResult_XB.data, 1008 format("Unittest failure. 
tsv_x != tsvResult_XB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1009 i + 1, csvb, tsv_x, tsvResult_XB.data)); 1010 1011 /* CSV Set A and TSV with $ separator and tsv delimiter/newline replacement. */ 1012 csvInputA = cast(ubyte[])csva; 1013 auto tsvResult_YA = appender!(char[])(); 1014 csv2tsv(csvInputA, tsvResult_YA, readBuffer, "csvInputA_TSV_WithDollarAndDelimReplacement", 0, '"', ',', '$', "|--|", "|--|"); 1015 assert(tsv_y == tsvResult_YA.data, 1016 format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1017 i + 1, csva, tsv_y, tsvResult_YA.data)); 1018 1019 /* CSV Set B and TSV with $ separator and tsv delimiter/newline replacement. Same TSV as CSV Set A.*/ 1020 csvInputB = cast(ubyte[])csvb; 1021 auto tsvResult_YB = appender!(char[])(); 1022 csv2tsv(csvInputB, tsvResult_YB, readBuffer, "csvInputB__TSV_WithDollarAndDelimReplacement", 0, '#', '^', '$', "|--|", "|--|"); 1023 assert(tsv_y == tsvResult_YB.data, 1024 format("Unittest failure. tsv_y != tsvResult_YB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1025 i + 1, csvb, tsv_y, tsvResult_YB.data)); 1026 1027 /* CSV Set A and TSV with TAB replacement as <TAB> and newline replacement as <NL>. Same TSV as CSV Set A.*/ 1028 csvInputA = cast(ubyte[])csva; 1029 auto tsvResult_ZA = appender!(char[])(); 1030 csv2tsv(csvInputA, tsvResult_ZA, readBuffer, "csvInputA_TSV_WithDifferentTABandNLReplacements", 0, '"', ',', '\t', "<TAB>", "<NL>"); 1031 assert(tsv_z == tsvResult_ZA.data, 1032 format("Unittest failure. tsv_z != tsvResult_ZA.data. 
Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1033 i + 1, csva, tsv_z, tsvResult_ZA.data)); 1034 } 1035 } 1036 } 1037

// csv2tsv skiplines tests
//
// Runs csv2tsv over small inputs with the skip-lines argument set to 1 and
// to 2, for each of the three newline forms (LF, CR, CRLF) and for a range
// of read-buffer sizes, and compares the output to the expected TSV text.
unittest
{
    import std.string : representation;

    /* One scenario: a CSV input and the TSV text expected after skipping the
     * first one or the first two input lines.
     */
    static struct SkipCase
    {
        string csv;     // CSV input text.
        string skip1;   // Expected output with skip-lines = 1.
        string skip2;   // Expected output with skip-lines = 2.
    }

    /* The trailing number on each row is the one-based test number reported
     * in assertion messages. Rows using a quote character hold a quoted field
     * whose only content is a newline; the expected results show such a
     * record being skipped as a single line.
     */
    auto skipCases = [
        /* No newline at all. */
        SkipCase("", "", ""),                                                       //  1
        SkipCase("a", "", ""),                                                      //  2

        /* LF newlines. */
        SkipCase("\n", "", ""),                                                     //  3
        SkipCase("\n\n", "\n", ""),                                                 //  4
        SkipCase("\n\n\n", "\n\n", "\n"),                                           //  5

        SkipCase("a\n", "", ""),                                                    //  6
        SkipCase("a\nb\n", "b\n", ""),                                              //  7
        SkipCase("a\nb\nc\n", "b\nc\n", "c\n"),                                     //  8

        SkipCase("\"\n\"\n", "", ""),                                               //  9
        SkipCase("\"\n\"\n\"\n\"\n", " \n", ""),                                    // 10
        SkipCase("\"\n\"\n\"\n\"\n\"\n\"\n", " \n \n", " \n"),                      // 11

        /* CR newlines. */
        SkipCase("\r", "", ""),                                                     // 12
        SkipCase("\r\r", "\n", ""),                                                 // 13
        SkipCase("\r\r\r", "\n\n", "\n"),                                           // 14

        SkipCase("a\r", "", ""),                                                    // 15
        SkipCase("a\rb\r", "b\n", ""),                                              // 16
        SkipCase("a\rb\rc\r", "b\nc\n", "c\n"),                                     // 17

        SkipCase("\"\r\"\r", "", ""),                                               // 18
        SkipCase("\"\r\"\r\"\r\"\r", " \n", ""),                                    // 19
        SkipCase("\"\r\"\r\"\r\"\r\"\r\"\r", " \n \n", " \n"),                      // 20

        /* CRLF newlines. */
        SkipCase("\r\n", "", ""),                                                   // 21
        SkipCase("\r\n\r\n", "\n", ""),                                             // 22
        SkipCase("\r\n\r\n\r\n", "\n\n", "\n"),                                     // 23

        SkipCase("a\r\n", "", ""),                                                  // 24
        SkipCase("a\r\nb\r\n", "b\n", ""),                                          // 25
        SkipCase("a\r\nb\r\nc\r\n", "b\nc\n", "c\n"),                               // 26

        SkipCase("\"\r\n\"\r\n", "", ""),                                           // 27
        SkipCase("\"\r\n\"\r\n\"\r\n\"\r\n", " \n", ""),                            // 28
        SkipCase("\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n", " \n \n", " \n"),          // 29
    ];

    /* Small buffers force newlines and quotes to straddle read boundaries. */
    auto bufferSizeTests = [1, 2, 3, 4, 8, 128];

    foreach (bufferSize; bufferSizeTests)
    {
        ubyte[] readBuffer = new ubyte[](bufferSize);

        foreach (i, testCase; skipCases)
        {
            auto csvToTSVSkip1 = appender!(char[])();
            auto csvToTSVSkip2 = appender!(char[])();

            /* Skip one line. */
            ubyte[] csvInput = testCase.csv.dup.representation;
            csv2tsv(csvInput, csvToTSVSkip1, readBuffer, "csvToTSVSkip1", 1);

            assert(testCase.skip1 == csvToTSVSkip1.data,
                   format("Unittest failure. tsvSkip1 != csvToTSVSkip1.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, testCase.csv, testCase.skip1, csvToTSVSkip1.data));

            /* Skip two lines. The input is re-created before re-use, following
             * the convention noted in the delimiter unittest earlier in this
             * file ("Consumed by csv2tsv, so need to be reset when re-used").
             */
            csvInput = testCase.csv.dup.representation;
            csv2tsv(csvInput, csvToTSVSkip2, readBuffer, "csvToTSVSkip2", 2);

            assert(testCase.skip2 == csvToTSVSkip2.data,
                   format("Unittest failure. tsvSkip2 != csvToTSVSkip2.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, testCase.csv, testCase.skip2, csvToTSVSkip2.data));
        }
    }
}

// csv2tsv BOM tests.
// Note: std.range.lockstep prevents use of @safe
//
// Runs csv2tsv over inputs with and without a leading UTF-8 byte order mark,
// with the final (BOM removal) argument both true and false, for a range of
// read-buffer sizes.
unittest
{
    import std.conv : hexString;
    import std.string : representation;

    enum utf8BOM = hexString!"efbbbf";

    /* CSV inputs without a BOM. */
    auto csv1 = "";
    auto csv2 = "a";
    auto csv3 = "ab";
    auto csv4 = "a,b";
    auto csv5 = "a,b\ncdef,ghi\njklmn,opqrs\ntuv,wxyz";

    /* The same CSV inputs with a BOM prepended. */
    auto csv1BOM = utf8BOM ~ csv1;
    auto csv2BOM = utf8BOM ~ csv2;
    auto csv3BOM = utf8BOM ~ csv3;
    auto csv4BOM = utf8BOM ~ csv4;
    auto csv5BOM = utf8BOM ~ csv5;

    /* Expected TSV when the output carries no BOM. */
    auto tsv1 = "";
    auto tsv2 = "a\n";
    auto tsv3 = "ab\n";
    auto tsv4 = "a\tb\n";
    auto tsv5 = "a\tb\ncdef\tghi\njklmn\topqrs\ntuv\twxyz\n";

    /* Expected TSV when the BOM is passed through to the output.
     *
     * Note: csv1 is the empty string, so tsv1 does not have a trailing newline.
     * However, with the BOM prepended the tsv gets a trailing newline.
     */
    auto tsv1BOM = utf8BOM ~ tsv1 ~ "\n";
    auto tsv2BOM = utf8BOM ~ tsv2;
    auto tsv3BOM = utf8BOM ~ tsv3;
    auto tsv4BOM = utf8BOM ~ tsv4;
    auto tsv5BOM = utf8BOM ~ tsv5;

    auto csvSet = [csv1, csv2, csv3, csv4, csv5];
    auto csvBOMSet = [csv1BOM, csv2BOM, csv3BOM, csv4BOM, csv5BOM];

    auto tsvSet = [tsv1, tsv2, tsv3, tsv4, tsv5];
    auto tsvBOMSet = [tsv1BOM, tsv2BOM, tsv3BOM, tsv4BOM, tsv5BOM];

    /* Sizes 1 and 2 are shorter than the three-byte BOM. */
    auto bufferSizeTests = [1, 2, 3, 4, 8, 128];

    foreach (bufferSize; bufferSizeTests)
    {
        ubyte[] readBuffer = new ubyte[](bufferSize);

        foreach (i, csv, csvBOM, tsv, tsvBOM; lockstep(csvSet, csvBOMSet, tsvSet, tsvBOMSet))
        {
            ubyte[] csvInput = csv.dup.representation;
            ubyte[] csvBOMInput = csvBOM.dup.representation;

            auto csvToTSV = appender!(char[])();
            auto csvToTSV_NoBOMRemoval = appender!(char[])();
            auto csvBOMToTSV = appender!(char[])();
            auto csvBOMToTSV_NoBOMRemoval = appender!(char[])();

            /* No BOM in the input, BOM removal on. */
            csv2tsv(csvInput, csvToTSV, readBuffer, "csvToTSV", 0, '"', ',', '\t', " ", " ", true);
            assert(tsv == csvToTSV.data,
                   format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsv, csvToTSV.data));

            /* No BOM in the input, BOM removal off. Inputs are re-created before
             * re-use, following the convention noted in the delimiter unittest
             * earlier in this file ("Consumed by csv2tsv, so need to be reset
             * when re-used").
             */
            csvInput = csv.dup.representation;
            csv2tsv(csvInput, csvToTSV_NoBOMRemoval, readBuffer, "csvToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
            assert(tsv == csvToTSV_NoBOMRemoval.data,
                   format("Unittest failure. tsv != csvToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsv, csvToTSV_NoBOMRemoval.data));

            /* BOM in the input, BOM removal on. */
            csv2tsv(csvBOMInput, csvBOMToTSV, readBuffer, "csvBOMToTSV", 0, '"', ',', '\t', " ", " ", true);
            if (readBuffer.length < utf8BOM.length)
            {
                /* Removing BOMs, but didn't provide enough buffer, so no removal. */
                assert(tsvBOM == csvBOMToTSV.data,
                       format("Unittest failure. tsvBOM != csvBOMToTSV.data. (Small buffer) Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                              i + 1, bufferSize, csvBOM, tsvBOM, csvBOMToTSV.data));
            }
            else
            {
                assert(tsv == csvBOMToTSV.data,
                       format("Unittest failure. tsv != csvBOMToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                              i + 1, bufferSize, csvBOM, tsv, csvBOMToTSV.data));
            }

            /* BOM in the input, BOM removal off: the BOM is expected in the output. */
            csvBOMInput = csvBOM.dup.representation;
            csv2tsv(csvBOMInput, csvBOMToTSV_NoBOMRemoval, readBuffer, "csvBOMToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
            assert(tsvBOM == csvBOMToTSV_NoBOMRemoval.data,
                   format("Unittest failure. tsvBOM != csvBOMToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csvBOM, tsvBOM, csvBOMToTSV_NoBOMRemoval.data));
        }
    }
}