/**
Convert CSV formatted data to TSV format.

This program converts comma-separated value data to tab-separated format.

Copyright (c) 2016-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module csv2tsv;

import std.stdio;
import std.format : format;
import std.range;
import std.traits : Unqual;
import std.typecons : Nullable, tuple;

/* Short help text, printed ahead of the generated option list for '--help'. */
auto helpText = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details the CSV formats accepted.

Options:
EOS";

/* Long help text, printed ahead of the generated option list for '--help-verbose'. */
auto helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.

Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.

Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.

There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.

Behaviors of this program that often vary between CSV implementations:
  * Newlines are supported in quoted fields.
  * Double quotes are permitted in a non-quoted field. However, a field starting
    with a quote must follow quoting rules.
  * Each record can have a different numbers of fields.
  * The three common forms of newlines are supported: CR, CRLF, LF.
  * A newline will be added if the file does not end with one.
  * No whitespace trimming is done.

This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.

UTF-8 input is assumed. Convert other encodings prior to invoking this tool.

Options:
EOS";

/** Container for command line options.
 */
struct Csv2tsvOptions
{
    string programName;
    bool helpVerbose = false;          // --help-verbose
    bool hasHeader = false;            // --H|header
    char csvQuoteChar = '"';           // --q|quote
    char csvDelimChar = ',';           // --c|csv-delim
    char tsvDelimChar = '\t';          // --t|tsv-delim
    string tsvDelimReplacement = " ";  // --r|replacement
    bool versionWanted = false;        // --V|version

    /** Parses the command line into this struct and validates the option combination.
     *
     * Recognized options are removed from cmdArgs by getopt; what remains is the
     * program name followed by the non-option arguments.
     *
     * Returns: A (bool, int) tuple. The first element is true when the caller should
     * go on to process input. When it is false the second element is the exit code
     * to use: 0 after printing help or version text, 1 after a command line error.
     * Errors are reported on stderr here, they are not propagated as exceptions.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            auto r = getopt(
                cmdArgs,
                "help-verbose",  "     Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",      "     Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote",       "CHR  Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim",   "CHR  Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim",   "CHR  Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|replacement", "STR  Replacement for newline and TSV field delimiters found in CSV input. Default: Space.", &tsvDelimReplacement,
                std.getopt.config.caseSensitive,
                "V|version",     "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Consistency checks. The conversion loop matches single characters, so
             * the quote and delimiter characters must be distinguishable from each
             * other and from newlines.
             */
            if (csvQuoteChar == '\n' || csvQuoteChar == '\r')
            {
                throw new Exception ("CSV quote character cannot be newline (--q|quote).");
            }

            if (csvQuoteChar == csvDelimChar)
            {
                throw new Exception("CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");
            }

            if (csvQuoteChar == tsvDelimChar)
            {
                throw new Exception("CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");
            }

            if (csvDelimChar == '\n' || csvDelimChar == '\r')
            {
                throw new Exception ("CSV field delimiter cannot be newline (--c|csv-delim).");
            }

            if (tsvDelimChar == '\n' || tsvDelimChar == '\r')
            {
                throw new Exception ("TSV field delimiter cannot be newline (--t|tsv-delimiter).");
            }

            /* The replacement text is written into TSV fields, so it must not itself
             * contain anything that terminates a TSV field or record.
             */
            if (canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(tsvDelimReplacement))
            {
                throw new Exception ("Replacement character cannot contain newlines or TSV field delimiters (--r|replacement).");
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

version(unittest)
{
    // No main in unittest
}
else
{
    /** Program entry point.
     *
     * Processes the command line, then converts the named files (or standard input
     * when none are given). Returns 0 on success and 1 on a command line or
     * conversion error.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try csv2tsvFiles(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            /* Terminate any partially written output line and push buffered output
             * out of stdout before reporting on stderr. It is the output stream that
             * needs flushing here; flushing stdin would not do this.
             */
            writeln();
            stdout.flush();
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}

/* This uses a D feature where a type can reserve a single value to represent null. */
alias NullableSizeT = Nullable!(size_t, size_t.max);


/** csv2tsvFiles reads multiple files and standard input and writes the results to
 * standard output.
 *
 * An empty file list means standard input; the name "-" also selects standard
 * input. When cmdopt.hasHeader is set, the first record of every file after the
 * first is converted into a null sink so that only one header line is output.
 *
 * Params:
 *   cmdopt     = Command line options controlling delimiters and header handling.
 *   inputFiles = Names of the files to convert, in order.
 */
void csv2tsvFiles(in Csv2tsvOptions cmdopt, in string[] inputFiles)
{
    import std.algorithm : joiner;
    import tsvutil : BufferedOutputRange;

    /* One read buffer is shared by all inputs. Standard input reads through a
     * 1024 byte slice of it rather than the full buffer.
     */
    ubyte[1024 * 1024] fileRawBuf;
    ubyte[] stdinRawBuf = fileRawBuf[0..1024];
    auto stdoutWriter = BufferedOutputRange!(typeof(stdout))(stdout);
    bool firstFile = true;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto ubyteChunkedStream = (filename == "-") ?
            stdin.byChunk(stdinRawBuf) : filename.File.byChunk(fileRawBuf);
        auto ubyteStream = ubyteChunkedStream.joiner;

        if (firstFile || !cmdopt.hasHeader)
        {
            csv2tsv(ubyteStream, stdoutWriter, filename, 0,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement);
        }
        else
        {
            /* Don't write the header on subsequent files. Write the first
             * record to a null sink instead.
             */
            auto nullWriter = NullSink();
            csv2tsv(ubyteStream, nullWriter, filename, 0,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                    NullableSizeT(1));

            /* Convert the rest of the file. One line has already been consumed,
             * so error line numbers start from an offset of 1.
             */
            csv2tsv(ubyteStream, stdoutWriter, filename, 1,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement);
        }
        firstFile = false;
    }
}

/** Read CSV from an input source, convert to TSV and write to an output source.
 *
 * The input is consumed one byte at a time by a four state machine. CR and CRLF
 * are folded to LF before the state machine sees them. A final newline is written
 * if the input ends part way through a record.
 *
 * Params:
 *   inputStream         =  A ubyte input range to read CSV text from. A ubyte range
 *                          matches byChunk. It also avoids conversion to dchar by front().
 *   outputStream        =  An output range to write TSV text to.
 *   filename            =  Name of file to use when reporting errors. A descriptive name
 *                          can be used in lieu of a file name.
 *   currFileLineNumber  =  First line being processed. Used when reporting errors. Needed
 *                          only when part of the input has already been processed.
 *   csvQuote            =  The quoting character used in the input CSV file.
 *   csvDelim            =  The field delimiter character used in the input CSV file.
 *   tsvDelim            =  The field delimiter character to use in the generated TSV file.
 *   tsvDelimReplacement =  A string to use when replacing newlines and TSV field delimiters
 *                          occurring in CSV fields.
 *   maxRecords          =  The maximum number of records to process (output lines). This is
 *                          intended to support processing the header line separately.
 *
 * Throws: Exception on finding inconsistent CSV. Exception text includes the filename and
 * line number where the error was identified.
 */
void csv2tsv(InputRange, OutputRange)
    (auto ref InputRange inputStream, auto ref OutputRange outputStream,
     string filename = "(none)", size_t currFileLineNumber = 0,
     const char csvQuote = '"', const char csvDelim = ',', const char tsvDelim = '\t',
     string tsvDelimReplacement = " ",
     NullableSizeT maxRecords=NullableSizeT.init,
     )
if (isInputRange!InputRange && isOutputRange!(OutputRange, char) &&
    is(Unqual!(ElementType!InputRange) == ubyte))
{
    enum State { FieldEnd, NonQuotedField, QuotedField, QuoteInQuotedField }

    State currState = State.FieldEnd;
    size_t recordNum = 1;      // Record number. Output line number.
    size_t fieldNum = 0;       // Field on current line.

InputLoop: while (!inputStream.empty)
    {
        char nextChar = inputStream.front;
        inputStream.popFront;

        if (nextChar == '\r')
        {
            /* Collapse newline cases to '\n'. */
            if (!inputStream.empty && inputStream.front == '\n')
            {
                inputStream.popFront;
            }
            nextChar = '\n';
        }

    OuterSwitch: final switch (currState)
        {
        case State.FieldEnd:
            /* Start of input or after consuming a field terminator. */
            ++fieldNum;

            /* Note: Can't use a switch here due to the 'goto case' to the OuterSwitch. */
            if (nextChar == csvQuote)
            {
                currState = State.QuotedField;
                break OuterSwitch;
            }
            else
            {
                /* Processing state change only. Don't consume the character. */
                currState = State.NonQuotedField;
                goto case State.NonQuotedField;
            }

        case State.NonQuotedField:
            switch (nextChar)
            {
            default:
                put(outputStream, nextChar);
                break OuterSwitch;
            case csvDelim:
                put(outputStream, tsvDelim);
                currState = State.FieldEnd;
                break OuterSwitch;
            case tsvDelim:
                /* A TSV delimiter in the data would split the field; replace it. */
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            case '\n':
                put(outputStream, '\n');
                ++recordNum;
                fieldNum = 0;
                currState = State.FieldEnd;
                if (!maxRecords.isNull && recordNum > maxRecords) break InputLoop;
                else break OuterSwitch;
            }

        case State.QuotedField:
            switch (nextChar)
            {
            default:
                put(outputStream, nextChar);
                break OuterSwitch;
            case csvQuote:
                /* Quote in a quoted field. Need to look at the next character. */
                if (!inputStream.empty)
                {
                    currState = State.QuoteInQuotedField;
                }
                else
                {
                    /* End of input. A rare case: Quoted field on last line with no
                     * following trailing newline. Reset the state to avoid triggering
                     * an invalid quoted field exception, plus adding additional newline.
                     */
                    currState = State.FieldEnd;
                }
                break OuterSwitch;
            case '\n':
                /* Newline in a quoted field. */
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            case tsvDelim:
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            }

        case State.QuoteInQuotedField:
            /* Just processed a quote in a quoted field. */
            switch (nextChar)
            {
            case csvQuote:
                /* A doubled quote is an escaped quote character in the data. */
                put(outputStream, csvQuote);
                currState = State.QuotedField;
                break OuterSwitch;
            case csvDelim:
                put(outputStream, tsvDelim);
                currState = State.FieldEnd;
                break OuterSwitch;
            case '\n':
                put(outputStream, '\n');
                ++recordNum;
                fieldNum = 0;
                currState = State.FieldEnd;

                if (!maxRecords.isNull && recordNum > maxRecords) break InputLoop;
                else break OuterSwitch;
            default:
                throw new Exception(
                    format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                           (filename == "-") ? "Standard Input" : filename,
                           currFileLineNumber + recordNum));
            }
        }
    }

    /* Input ran out while still inside a quoted field. */
    if (currState == State.QuotedField)
    {
        throw new Exception(
            format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                   (filename == "-") ? "Standard Input" : filename,
                   currFileLineNumber + recordNum));
    }

    if (fieldNum > 0) put(outputStream, '\n');    // Last line w/o terminating newline.
}

unittest
{
    /* Unit tests for the csv2tsv function.
     *
     * These unit tests exercise different CSV combinations and escaping cases. The CSV
     * data content is the same for each corresponding test string, except the delimiters
     * have been changed. e.g csv6a and csv6b have the same data content.
     *
     * A property used in these tests is that changing the CSV delimiters doesn't change
     * the resulting TSV. However, changing the TSV delimiters will change the TSV result,
     * as TSV doesn't support having its delimiters in the data. This allows having a
     * single TSV expected set that is generated by CSVs with different delimiter sets.
     *
     * This test set does not test main, file handling, or error messages. These are
     * handled by tests run against the executable.
     */

    /* Default CSV. */
    auto csv1a = "a,b,c";
    auto csv2a = "a,bc,,,def";
    auto csv3a = ",a, b , cd ,";
    auto csv4a = "ß,ßÀß,あめりか物語,书名: 五色石";
    auto csv5a = "\"\n\",\"\n\n\",\"\n\n\n\"";
    auto csv6a = "\"\t\",\"\t\t\",\"\t\t\t\"";
    auto csv7a = "\",\",\",,\",\",,,\"";
    auto csv8a = "\"\",\"\"\"\",\"\"\"\"\"\"";
    auto csv9a = "\"ab, de\tfg\"\"\nhij\"";
    auto csv10a = "";
    auto csv11a = ",";
    auto csv12a = ",,";
    auto csv13a = "\"\r\",\"\r\r\",\"\r\r\r\"";
    auto csv14a = "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\"";
    auto csv15a = "\"ab, de\tfg\"\"\rhij\"";
    auto csv16a = "\"ab, de\tfg\"\"\r\nhij\"";
    auto csv17a = "ab\",ab\"cd";
    auto csv18a = "\n\n\n";
    auto csv19a = "\t";
    auto csv20a = "\t\t";
    auto csv21a = "a\n";
    auto csv22a = "a,\n";
    auto csv23a = "a,b\n";
    auto csv24a = ",\n";
    auto csv25a = "#";
    auto csv26a = "^";
    auto csv27a = "#^#";
    auto csv28a = "^#^";
    auto csv29a = "$";
    auto csv30a = "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n";
    auto csv31a = "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n";
    auto csv32a = ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\"";

    /* Set B has the same data and TSV results as set A, but uses # for quote and ^ for comma. */
    auto csv1b = "a^b^c";
    auto csv2b = "a^bc^^^def";
    auto csv3b = "^a^ b ^ cd ^";
    auto csv4b = "ß^ßÀß^あめりか物語^书名: 五色石";
    auto csv5b = "#\n#^#\n\n#^#\n\n\n#";
    auto csv6b = "#\t#^#\t\t#^#\t\t\t#";
    auto csv7b = "#,#^#,,#^#,,,#";
    auto csv8b = "##^#\"#^#\"\"#";
    auto csv9b = "#ab, de\tfg\"\nhij#";
    auto csv10b = "";
    auto csv11b = "^";
    auto csv12b = "^^";
    auto csv13b = "#\r#^#\r\r#^#\r\r\r#";
    auto csv14b = "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#";
    auto csv15b = "#ab, de\tfg\"\rhij#";
    auto csv16b = "#ab, de\tfg\"\r\nhij#";
    auto csv17b = "ab\"^ab\"cd";
    auto csv18b = "\n\n\n";
    auto csv19b = "\t";
    auto csv20b = "\t\t";
    auto csv21b = "a\n";
    auto csv22b = "a^\n";
    auto csv23b = "a^b\n";
    auto csv24b = "^\n";
    auto csv25b = "####";
    auto csv26b = "#^#";
    auto csv27b = "###^###";
    auto csv28b = "#^##^#";
    auto csv29b = "$";
    auto csv30b = "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n";
    auto csv31b = "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n";
    auto csv32b = "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#";

    /* The expected results for csv sets A and B. This is for the default TSV delimiters.
     * Each newline or TAB inside a field becomes one space, so fields holding one, two
     * and three of them become one, two and three spaces respectively.
     */
    auto tsv1 = "a\tb\tc\n";
    auto tsv2 = "a\tbc\t\t\tdef\n";
    auto tsv3 = "\ta\t b \t cd \t\n";
    auto tsv4 = "ß\tßÀß\tあめりか物語\t书名: 五色石\n";
    auto tsv5 = " \t  \t   \n";
    auto tsv6 = " \t  \t   \n";
    auto tsv7 = ",\t,,\t,,,\n";
    auto tsv8 = "\t\"\t\"\"\n";
    auto tsv9 = "ab, de fg\" hij\n";
    auto tsv10 = "";
    auto tsv11 = "\t\n";
    auto tsv12 = "\t\t\n";
    auto tsv13 = " \t  \t   \n";
    auto tsv14 = " \t  \t   \n";
    auto tsv15 = "ab, de fg\" hij\n";
    auto tsv16 = "ab, de fg\" hij\n";
    auto tsv17 = "ab\"\tab\"cd\n";
    auto tsv18 = "\n\n\n";
    auto tsv19 = " \n";
    auto tsv20 = "  \n";
    auto tsv21 = "a\n";
    auto tsv22 = "a\t\n";
    auto tsv23 = "a\tb\n";
    auto tsv24 = "\t\n";
    auto tsv25 = "#\n";
    auto tsv26 = "^\n";
    auto tsv27 = "#^#\n";
    auto tsv28 = "^#^\n";
    auto tsv29 = "$\n";
    auto tsv30 = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n";
    auto tsv31 = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n";
    auto tsv32 = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n";

    /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab.
     * This will also result in different replacements when TAB and $ appear in the CSV.
     */
    auto tsv1_x = "a$b$c\n";
    auto tsv2_x = "a$bc$$$def\n";
    auto tsv3_x = "$a$ b $ cd $\n";
    auto tsv4_x = "ß$ßÀß$あめりか物語$书名: 五色石\n";
    auto tsv5_x = " $  $   \n";
    auto tsv6_x = "\t$\t\t$\t\t\t\n";
    auto tsv7_x = ",$,,$,,,\n";
    auto tsv8_x = "$\"$\"\"\n";
    auto tsv9_x = "ab, de\tfg\" hij\n";
    auto tsv10_x = "";
    auto tsv11_x = "$\n";
    auto tsv12_x = "$$\n";
    auto tsv13_x = " $  $   \n";
    auto tsv14_x = " $  $   \n";
    auto tsv15_x = "ab, de\tfg\" hij\n";
    auto tsv16_x = "ab, de\tfg\" hij\n";
    auto tsv17_x = "ab\"$ab\"cd\n";
    auto tsv18_x = "\n\n\n";
    auto tsv19_x = "\t\n";
    auto tsv20_x = "\t\t\n";
    auto tsv21_x = "a\n";
    auto tsv22_x = "a$\n";
    auto tsv23_x = "a$b\n";
    auto tsv24_x = "$\n";
    auto tsv25_x = "#\n";
    auto tsv26_x = "^\n";
    auto tsv27_x = "#^#\n";
    auto tsv28_x = "^#^\n";
    auto tsv29_x = " \n";
    auto tsv30_x = " $ \n $  $  \n^# $ #^$# ^$^ #\n";
    auto tsv31_x = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n";
    auto tsv32_x = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n";

    /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab,
     * and with the delimiter/newline replacement string being |--|. Basically, newlines
     * and '$' in the original data are replaced by |--|.
     */
    auto tsv1_y = "a$b$c\n";
    auto tsv2_y = "a$bc$$$def\n";
    auto tsv3_y = "$a$ b $ cd $\n";
    auto tsv4_y = "ß$ßÀß$あめりか物語$书名: 五色石\n";
    auto tsv5_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv6_y = "\t$\t\t$\t\t\t\n";
    auto tsv7_y = ",$,,$,,,\n";
    auto tsv8_y = "$\"$\"\"\n";
    auto tsv9_y = "ab, de\tfg\"|--|hij\n";
    auto tsv10_y = "";
    auto tsv11_y = "$\n";
    auto tsv12_y = "$$\n";
    auto tsv13_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv14_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv15_y = "ab, de\tfg\"|--|hij\n";
    auto tsv16_y = "ab, de\tfg\"|--|hij\n";
    auto tsv17_y = "ab\"$ab\"cd\n";
    auto tsv18_y = "\n\n\n";
    auto tsv19_y = "\t\n";
    auto tsv20_y = "\t\t\n";
    auto tsv21_y = "a\n";
    auto tsv22_y = "a$\n";
    auto tsv23_y = "a$b\n";
    auto tsv24_y = "$\n";
    auto tsv25_y = "#\n";
    auto tsv26_y = "^\n";
    auto tsv27_y = "#^#\n";
    auto tsv28_y = "^#^\n";
    auto tsv29_y = "|--|\n";
    auto tsv30_y = "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n";
    auto tsv31_y = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n";
    auto tsv32_y = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n";

    auto csvSet1a = [csv1a, csv2a, csv3a, csv4a, csv5a, csv6a, csv7a, csv8a, csv9a, csv10a,
                     csv11a, csv12a, csv13a, csv14a, csv15a, csv16a, csv17a, csv18a, csv19a, csv20a,
                     csv21a, csv22a, csv23a, csv24a, csv25a, csv26a, csv27a, csv28a, csv29a, csv30a,
                     csv31a, csv32a];

    auto csvSet1b = [csv1b, csv2b, csv3b, csv4b, csv5b, csv6b, csv7b, csv8b, csv9b, csv10b,
                     csv11b, csv12b, csv13b, csv14b, csv15b, csv16b, csv17b, csv18b, csv19b, csv20b,
                     csv21b, csv22b, csv23b, csv24b, csv25b, csv26b, csv27b, csv28b, csv29b, csv30b,
                     csv31b, csv32b];

    auto tsvSet1  = [tsv1, tsv2, tsv3, tsv4, tsv5, tsv6, tsv7, tsv8, tsv9, tsv10,
                     tsv11, tsv12, tsv13, tsv14, tsv15, tsv16, tsv17, tsv18, tsv19, tsv20,
                     tsv21, tsv22, tsv23, tsv24, tsv25, tsv26, tsv27, tsv28, tsv29, tsv30,
                     tsv31, tsv32];

    auto tsvSet1_x  = [tsv1_x, tsv2_x, tsv3_x, tsv4_x, tsv5_x, tsv6_x, tsv7_x, tsv8_x, tsv9_x, tsv10_x,
                       tsv11_x, tsv12_x, tsv13_x, tsv14_x, tsv15_x, tsv16_x, tsv17_x, tsv18_x, tsv19_x, tsv20_x,
                       tsv21_x, tsv22_x, tsv23_x, tsv24_x, tsv25_x, tsv26_x, tsv27_x, tsv28_x, tsv29_x, tsv30_x,
                       tsv31_x, tsv32_x];

    auto tsvSet1_y  = [tsv1_y, tsv2_y, tsv3_y, tsv4_y, tsv5_y, tsv6_y, tsv7_y, tsv8_y, tsv9_y, tsv10_y,
                       tsv11_y, tsv12_y, tsv13_y, tsv14_y, tsv15_y, tsv16_y, tsv17_y, tsv18_y, tsv19_y, tsv20_y,
                       tsv21_y, tsv22_y, tsv23_y, tsv24_y, tsv25_y, tsv26_y, tsv27_y, tsv28_y, tsv29_y, tsv30_y,
                       tsv31_y, tsv32_y];

    foreach (i, csva, csvb, tsv, tsv_x, tsv_y; lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y))
    {
        import std.conv : to;

        /* Byte streams for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */
        ubyte[] csvInputA = cast(ubyte[])csva;
        ubyte[] csvInputB = cast(ubyte[])csvb;

        /* CSV Set A vs TSV expected. */
        auto tsvResultA = appender!(char[])();
        csv2tsv(csvInputA, tsvResultA, "csvInputA_defaultTSV", i);
        assert(tsv == tsvResultA.data,
               format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csva, tsv, tsvResultA.data));

        /* CSV Set B vs TSV expected. Different CSV delimiters, same TSV results as CSV Set A.*/
        auto tsvResultB = appender!(char[])();
        csv2tsv(csvInputB, tsvResultB, "csvInputB_defaultTSV", i, '#', '^');
        assert(tsv == tsvResultB.data,
               format("Unittest failure. tsv != tsvResultB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csvb, tsv, tsvResultB.data));

        /* CSV Set A and TSV with $ separator.*/
        csvInputA = cast(ubyte[])csva;
        auto tsvResult_XA = appender!(char[])();
        csv2tsv(csvInputA, tsvResult_XA, "csvInputA_TSV_WithDollarDelimiter", i, '"', ',', '$');
        assert(tsv_x == tsvResult_XA.data,
               format("Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csva, tsv_x, tsvResult_XA.data));

        /* CSV Set B and TSV with $ separator. Same TSV results as CSV Set A.*/
        csvInputB = cast(ubyte[])csvb;
        auto tsvResult_XB = appender!(char[])();
        csv2tsv(csvInputB, tsvResult_XB, "csvInputB__TSV_WithDollarDelimiter", i, '#', '^', '$');
        assert(tsv_x == tsvResult_XB.data,
               format("Unittest failure. tsv_x != tsvResult_XB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csvb, tsv_x, tsvResult_XB.data));

        /* CSV Set A and TSV with $ separator and tsv delimiter/newline replacement. */
        csvInputA = cast(ubyte[])csva;
        auto tsvResult_YA = appender!(char[])();
        csv2tsv(csvInputA, tsvResult_YA, "csvInputA_TSV_WithDollarAndDelimReplacement", i, '"', ',', '$', "|--|");
        assert(tsv_y == tsvResult_YA.data,
               format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csva, tsv_y, tsvResult_YA.data));

        /* CSV Set B and TSV with $ separator and tsv delimiter/newline replacement. Same TSV as CSV Set A.*/
        csvInputB = cast(ubyte[])csvb;
        auto tsvResult_YB = appender!(char[])();
        csv2tsv(csvInputB, tsvResult_YB, "csvInputB__TSV_WithDollarAndDelimReplacement", i, '#', '^', '$', "|--|");
        assert(tsv_y == tsvResult_YB.data,
               format("Unittest failure. tsv_y != tsvResult_YB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csvb, tsv_y, tsvResult_YB.data));

    }
}

unittest
{
    /* Unit tests for 'maxRecords' feature of the csv2tsv function.
     *
     * Each input is converted twice: once limited to maxRecords, then again without a
     * limit on the same, partially consumed, stream to pick up whatever is left.
     */

    /* Input CSV. */
    auto csv1 = "";
    auto csv2 = ",";
    auto csv3 = "a";
    auto csv4 = "a\n";
    auto csv5 = "a\nb";
    auto csv6 = "a\nb\n";
    auto csv7 = "a\nb\nc";
    auto csv8 = "a\nb\nc\n";
    auto csv9 = "a,aa";
    auto csv10 = "a,aa\n";
    auto csv11 = "a,aa\nb,bb";
    auto csv12 = "a,aa\nb,bb\n";
    auto csv13 = "a,aa\nb,bb\nc,cc";
    auto csv14 = "a,aa\nb,bb\nc,cc\n";

    auto csv15 = "\"a\",\"aa\"";
    auto csv16 = "\"a\",\"aa\"\n";
    auto csv17 = "\"a\",\"aa\"\n\"b\",\"bb\"";
    auto csv18 = "\"a\",\"aa\"\n\"b\",\"bb\"\n";
    auto csv19 = "\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"";
    auto csv20 = "\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"\n";

    /* TSV with max 1 record. */
    auto tsv1_max1 = "";
    auto tsv2_max1 = "\t\n";
    auto tsv3_max1 = "a\n";
    auto tsv4_max1 = "a\n";
    auto tsv5_max1 = "a\n";
    auto tsv6_max1 = "a\n";
    auto tsv7_max1 = "a\n";
    auto tsv8_max1 = "a\n";
    auto tsv9_max1 = "a\taa\n";
    auto tsv10_max1 = "a\taa\n";
    auto tsv11_max1 = "a\taa\n";
    auto tsv12_max1 = "a\taa\n";
    auto tsv13_max1 = "a\taa\n";
    auto tsv14_max1 = "a\taa\n";

    auto tsv15_max1 = "a\taa\n";
    auto tsv16_max1 = "a\taa\n";
    auto tsv17_max1 = "a\taa\n";
    auto tsv18_max1 = "a\taa\n";
    auto tsv19_max1 = "a\taa\n";
    auto tsv20_max1 = "a\taa\n";

    /* Remaining TSV converted after first call. */
    auto tsv1_max1_rest = "";
    auto tsv2_max1_rest = "";
    auto tsv3_max1_rest = "";
    auto tsv4_max1_rest = "";
    auto tsv5_max1_rest = "b\n";
    auto tsv6_max1_rest = "b\n";
    auto tsv7_max1_rest = "b\nc\n";
    auto tsv8_max1_rest = "b\nc\n";
    auto tsv9_max1_rest = "";
    auto tsv10_max1_rest = "";
    auto tsv11_max1_rest = "b\tbb\n";
    auto tsv12_max1_rest = "b\tbb\n";
    auto tsv13_max1_rest = "b\tbb\nc\tcc\n";
    auto tsv14_max1_rest = "b\tbb\nc\tcc\n";

    auto tsv15_max1_rest = "";
    auto tsv16_max1_rest = "";
    auto tsv17_max1_rest = "b\tbb\n";
    auto tsv18_max1_rest = "b\tbb\n";
    auto tsv19_max1_rest = "b\tbb\nc\tcc\n";
    auto tsv20_max1_rest = "b\tbb\nc\tcc\n";

    /* TSV with max 2 records. */
    auto tsv1_max2 = "";
    auto tsv2_max2 = "\t\n";
    auto tsv3_max2 = "a\n";
    auto tsv4_max2 = "a\n";
    auto tsv5_max2 = "a\nb\n";
    auto tsv6_max2 = "a\nb\n";
    auto tsv7_max2 = "a\nb\n";
    auto tsv8_max2 = "a\nb\n";
    auto tsv9_max2 = "a\taa\n";
    auto tsv10_max2 = "a\taa\n";
    auto tsv11_max2 = "a\taa\nb\tbb\n";
    auto tsv12_max2 = "a\taa\nb\tbb\n";
    auto tsv13_max2 = "a\taa\nb\tbb\n";
    auto tsv14_max2 = "a\taa\nb\tbb\n";

    auto tsv15_max2 = "a\taa\n";
    auto tsv16_max2 = "a\taa\n";
    auto tsv17_max2 = "a\taa\nb\tbb\n";
    auto tsv18_max2 = "a\taa\nb\tbb\n";
    auto tsv19_max2 = "a\taa\nb\tbb\n";
    auto tsv20_max2 = "a\taa\nb\tbb\n";

    /* Remaining TSV converted after first call. */
    auto tsv1_max2_rest = "";
    auto tsv2_max2_rest = "";
    auto tsv3_max2_rest = "";
    auto tsv4_max2_rest = "";
    auto tsv5_max2_rest = "";
    auto tsv6_max2_rest = "";
    auto tsv7_max2_rest = "c\n";
    auto tsv8_max2_rest = "c\n";
    auto tsv9_max2_rest = "";
    auto tsv10_max2_rest = "";
    auto tsv11_max2_rest = "";
    auto tsv12_max2_rest = "";
    auto tsv13_max2_rest = "c\tcc\n";
    auto tsv14_max2_rest = "c\tcc\n";

    auto tsv15_max2_rest = "";
    auto tsv16_max2_rest = "";
    auto tsv17_max2_rest = "";
    auto tsv18_max2_rest = "";
    auto tsv19_max2_rest = "c\tcc\n";
    auto tsv20_max2_rest = "c\tcc\n";

    auto csvSet1 =
        [csv1, csv2, csv3, csv4, csv5, csv6, csv7,
         csv8, csv9, csv10, csv11, csv12, csv13, csv14,
         csv15, csv16, csv17, csv18, csv19, csv20 ];

    auto tsvMax1Set1 =
        [tsv1_max1, tsv2_max1, tsv3_max1, tsv4_max1, tsv5_max1, tsv6_max1, tsv7_max1,
         tsv8_max1, tsv9_max1, tsv10_max1, tsv11_max1, tsv12_max1, tsv13_max1, tsv14_max1,
         tsv15_max1, tsv16_max1, tsv17_max1, tsv18_max1, tsv19_max1, tsv20_max1];

    auto tsvMax1RestSet1 =
        [tsv1_max1_rest, tsv2_max1_rest, tsv3_max1_rest, tsv4_max1_rest, tsv5_max1_rest, tsv6_max1_rest, tsv7_max1_rest,
         tsv8_max1_rest, tsv9_max1_rest, tsv10_max1_rest, tsv11_max1_rest, tsv12_max1_rest, tsv13_max1_rest, tsv14_max1_rest,
         tsv15_max1_rest, tsv16_max1_rest, tsv17_max1_rest, tsv18_max1_rest, tsv19_max1_rest, tsv20_max1_rest];

    auto tsvMax2Set1 =
        [tsv1_max2, tsv2_max2, tsv3_max2, tsv4_max2, tsv5_max2, tsv6_max2, tsv7_max2,
         tsv8_max2, tsv9_max2, tsv10_max2, tsv11_max2, tsv12_max2, tsv13_max2, tsv14_max2,
         tsv15_max2, tsv16_max2, tsv17_max2, tsv18_max2, tsv19_max2, tsv20_max2];

    auto tsvMax2RestSet1 =
        [tsv1_max2_rest, tsv2_max2_rest, tsv3_max2_rest, tsv4_max2_rest, tsv5_max2_rest, tsv6_max2_rest, tsv7_max2_rest,
         tsv8_max2_rest, tsv9_max2_rest, tsv10_max2_rest, tsv11_max2_rest, tsv12_max2_rest, tsv13_max2_rest, tsv14_max2_rest,
         tsv15_max2_rest, tsv16_max2_rest, tsv17_max2_rest, tsv18_max2_rest, tsv19_max2_rest, tsv20_max2_rest];

    foreach (i, csv, tsv_max1, tsv_max1_rest, tsv_max2, tsv_max2_rest;
             lockstep(csvSet1, tsvMax1Set1, tsvMax1RestSet1, tsvMax2Set1, tsvMax2RestSet1))
    {
        /* Byte stream for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */
        ubyte[] csvInput = cast(ubyte[])csv;

        /* Call with maxRecords == 1. */
        auto tsvMax1Result = appender!(char[])();
        csv2tsv(csvInput, tsvMax1Result, "maxRecords-one", i, '"', ',', '\t', " ", NullableSizeT(1));
        assert(tsv_max1 == tsvMax1Result.data,
               format("Unittest failure. tsv_max1 != tsvMax1Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max1, tsvMax1Result.data));

        /* Follow-up call getting all records remaining after the maxRecords==1 call. */
        auto tsvMax1RestResult = appender!(char[])();
        csv2tsv(csvInput, tsvMax1RestResult, "maxRecords-one-followup", i);
        assert(tsv_max1_rest == tsvMax1RestResult.data,
               format("Unittest failure. tsv_max1_rest != tsvMax1RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max1_rest, tsvMax1RestResult.data));

        /* Reset the input stream for maxRecords == 2. */
        csvInput = cast(ubyte[])csv;

        /* Call with maxRecords == 2. */
        auto tsvMax2Result = appender!(char[])();
        csv2tsv(csvInput, tsvMax2Result, "maxRecords-two", i, '"', ',', '\t', " ", NullableSizeT(2));
        assert(tsv_max2 == tsvMax2Result.data,
               format("Unittest failure. tsv_max2 != tsvMax2Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max2, tsvMax2Result.data));

        /* Follow-up call getting all records remaining after the maxRecords==2 call. */
        auto tsvMax2RestResult = appender!(char[])();
        csv2tsv(csvInput, tsvMax2RestResult, "maxRecords-two-followup", i);
        assert(tsv_max2_rest == tsvMax2RestResult.data,
               format("Unittest failure. tsv_max2_rest != tsvMax2RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max2_rest, tsvMax2RestResult.data));
    }
}