/**
Convert CSV formatted data to TSV format.

This program converts comma-separated value data to tab-separated format.

Copyright (c) 2016-2019, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.csv2tsv;

import std.stdio;
import std.format : format;
import std.range;
import std.traits : Unqual;
import std.typecons : Nullable, tuple;

immutable helpText = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details the CSV formats accepted.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.

Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.

Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.

There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.

Behaviors of this program that often vary between CSV implementations:
* Newlines are supported in quoted fields.
* Double quotes are permitted in a non-quoted field. However, a field starting
with a quote must follow quoting rules.
* Each record can have a different numbers of fields.
* The three common forms of newlines are supported: CR, CRLF, LF.
* A newline will be added if the file does not end with one.
* No whitespace trimming is done.

This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.

UTF-8 input is assumed. Convert other encodings prior to invoking this tool.

Options:
EOS";

/** Container for command line options.
 */
struct Csv2tsvOptions
{
    string programName;
    bool helpVerbose = false;          // --help-verbose
    bool hasHeader = false;            // --H|header
    char csvQuoteChar = '"';           // --q|quote
    char csvDelimChar = ',';           // --c|csv-delim
    char tsvDelimChar = '\t';          // --t|tsv-delim
    string tsvDelimReplacement = " ";  // --r|replacement
    bool versionWanted = false;        // --V|version

    /** Parses the command line and fills in the option fields.
     *
     * Recognized options are removed from cmdArgs by getopt; what remains is
     * the program name followed by the non-option arguments.
     *
     * Returns: A (bool, int) tuple. The bool is true when processing should
     * continue. When it is false the program should exit using the int as its
     * exit status: 0 after printing help or version information, 1 after an
     * option error (the error message is written to stderr).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote", "CHR Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim", "CHR Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim", "CHR Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|replacement", "STR Replacement for newline and TSV field delimiters found in CSV input. Default: Space.", &tsvDelimReplacement,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Consistency checks. */
            if (csvQuoteChar == '\n' || csvQuoteChar == '\r')
            {
                throw new Exception ("CSV quote character cannot be newline (--q|quote).");
            }

            if (csvQuoteChar == csvDelimChar)
            {
                throw new Exception("CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");
            }

            if (csvQuoteChar == tsvDelimChar)
            {
                throw new Exception("CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");
            }

            if (csvDelimChar == '\n' || csvDelimChar == '\r')
            {
                throw new Exception ("CSV field delimiter cannot be newline (--c|csv-delim).");
            }

            if (tsvDelimChar == '\n' || tsvDelimChar == '\r')
            {
                throw new Exception ("TSV field delimiter cannot be newline (--t|tsv-delimiter).");
            }

            /* The replacement text is written into TSV fields, so it must not itself
             * contain a record or field terminator.
             */
            if (canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(tsvDelimReplacement))
            {
                throw new Exception ("Replacement character cannot contain newlines or TSV field delimiters (--r|replacement).");
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/* D runtime option string (GC 'cleanup:none' setting). Only declared for compiler
 * front-end versions 2.085 and later.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // No main in unittest
}
else
{
    /** Program entry point.
     *
     * Processes the command line, then converts the named files (or standard input
     * when no files are given) to TSV on standard output.
     *
     * Returns: 0 on success, 1 if option processing or the conversion failed.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try csv2tsvFiles(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            /* Terminate any partially written output line and push it out on stdout
             * before reporting the error on stderr. The stream to flush is stdout;
             * flushing stdin (an input stream) is not meaningful.
             */
            writeln();
            stdout.flush();
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}

/* This uses a D feature where a type can reserve a single value to represent null. */
alias NullableSizeT = Nullable!(size_t, size_t.max);


/** csv2tsvFiles reads multiple files and standard input and writes the results to
 * standard output.
 *
 * Params:
 *   cmdopt      =  Command line options controlling delimiters, quoting, replacement
 *                  text and header handling.
 *   inputFiles  =  Files to convert. "-" means standard input. Standard input is
 *                  read when the list is empty.
 *
 * When cmdopt.hasHeader is set, the first record of every file after the first is
 * converted into a null sink, so only the first file's header is output.
 */
void csv2tsvFiles(in Csv2tsvOptions cmdopt, in string[] inputFiles)
{
    import std.algorithm : joiner;
    import tsv_utils.common.utils : BufferedOutputRange;

    /* Standard input is read through a smaller window of the same buffer. */
    ubyte[1024 * 1024] fileRawBuf;
    ubyte[] stdinRawBuf = fileRawBuf[0..1024];
    auto stdoutWriter = BufferedOutputRange!(typeof(stdout))(stdout);
    bool firstFile = true;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto ubyteChunkedStream = (filename == "-") ?
            stdin.byChunk(stdinRawBuf) : filename.File.byChunk(fileRawBuf);
        auto ubyteStream = ubyteChunkedStream.joiner;

        if (firstFile || !cmdopt.hasHeader)
        {
            csv2tsv(ubyteStream, stdoutWriter, filename, 0,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement);
        }
        else
        {
            /* Don't write the header on subsequent files. Write the first
             * record to a null sink instead.
             */
            auto nullWriter = NullSink();
            csv2tsv(ubyteStream, nullWriter, filename, 0,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                    NullableSizeT(1));
            csv2tsv(ubyteStream, stdoutWriter, filename, 1,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement);
        }
        firstFile = false;
    }
}

/** Read CSV from an input source, convert to TSV and write to an output source.
 *
 * Params:
 *   InputRange          =  A ubyte input range to read CSV text from. A ubyte range
 *                          matches byChunk. It also avoids conversion to dchar by front().
 *   OutputRange         =  An output range to write TSV text to.
 *   filename            =  Name of file to use when reporting errors. A descriptive name
 *                          can be used in lieu of a file name.
 *   currFileLineNumber  =  First line being processed. Used when reporting errors. Needed
 *                          only when part of the input has already been processed.
 *   csvQuote            =  The quoting character used in the input CSV file.
 *   csvDelim            =  The field delimiter character used in the input CSV file.
 *   tsvDelim            =  The field delimiter character to use in the generated TSV file.
 *   tsvDelimReplacement =  A string to use when replacing newlines and TSV field delimiters
 *                          occurring in CSV fields.
 *   maxRecords          =  The maximum number of records to process (output lines). This is
 *                          intended to support processing the header line separately.
 *
 * Throws: Exception on finding inconsistent CSV. Exception text includes the filename and
 * line number where the error was identified.
 */
void csv2tsv(InputRange, OutputRange)
    (auto ref InputRange inputStream, auto ref OutputRange outputStream,
     string filename = "(none)", size_t currFileLineNumber = 0,
     const char csvQuote = '"', const char csvDelim = ',', const char tsvDelim = '\t',
     string tsvDelimReplacement = " ",
     NullableSizeT maxRecords=NullableSizeT.init,
     )
if (isInputRange!InputRange && isOutputRange!(OutputRange, char) &&
    is(Unqual!(ElementType!InputRange) == ubyte))
{
    enum State { FieldEnd, NonQuotedField, QuotedField, QuoteInQuotedField }

    State currState = State.FieldEnd;
    size_t recordNum = 1;      // Record number. Output line number.
    size_t fieldNum = 0;       // Field on current line.

    InputLoop: while (!inputStream.empty)
    {
        char nextChar = inputStream.front;
        inputStream.popFront;

        if (nextChar == '\r')
        {
            /* Collapse newline cases to '\n'. */
            if (!inputStream.empty && inputStream.front == '\n')
            {
                inputStream.popFront;
            }
            nextChar = '\n';
        }

        OuterSwitch: final switch (currState)
        {
        case State.FieldEnd:
            /* Start of input or after consuming a field terminator. */
            ++fieldNum;

            /* Note: Can't use a switch here due to the 'goto case' to the OuterSwitch. */
            if (nextChar == csvQuote)
            {
                currState = State.QuotedField;
                break OuterSwitch;
            }
            else
            {
                /* Processing state change only. Don't consume the character. */
                currState = State.NonQuotedField;
                goto case State.NonQuotedField;
            }

        case State.NonQuotedField:
            switch (nextChar)
            {
            default:
                put(outputStream, nextChar);
                break OuterSwitch;
            case csvDelim:
                put(outputStream, tsvDelim);
                currState = State.FieldEnd;
                break OuterSwitch;
            case tsvDelim:
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            case '\n':
                put(outputStream, '\n');
                ++recordNum;
                fieldNum = 0;
                currState = State.FieldEnd;
                if (!maxRecords.isNull && recordNum > maxRecords) break InputLoop;
                else break OuterSwitch;
            }

        case State.QuotedField:
            switch (nextChar)
            {
            default:
                put(outputStream, nextChar);
                break OuterSwitch;
            case csvQuote:
                /* Quote in a quoted field. Need to look at the next character.*/
                if (!inputStream.empty)
                {
                    currState = State.QuoteInQuotedField;
                }
                else
                {
                    /* End of input. A rare case: Quoted field on last line with no
                     * following trailing newline. Reset the state to avoid triggering
                     * an invalid quoted field exception, plus adding additional newline.
                     */
                    currState = State.FieldEnd;
                }
                break OuterSwitch;
            case '\n':
                /* Newline in a quoted field. */
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            case tsvDelim:
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            }

        case State.QuoteInQuotedField:
            /* Just processed a quote in a quoted field. */
            switch (nextChar)
            {
            case csvQuote:
                put(outputStream, csvQuote);
                currState = State.QuotedField;
                break OuterSwitch;
            case csvDelim:
                put(outputStream, tsvDelim);
                currState = State.FieldEnd;
                break OuterSwitch;
            case '\n':
                put(outputStream, '\n');
                ++recordNum;
                fieldNum = 0;
                currState = State.FieldEnd;

                if (!maxRecords.isNull && recordNum > maxRecords) break InputLoop;
                else break OuterSwitch;
            default:
                throw new Exception(
                    format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                           (filename == "-") ? "Standard Input" : filename,
                           currFileLineNumber + recordNum));
            }
        }
    }

    if (currState == State.QuotedField)
    {
        throw new Exception(
            format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                   (filename == "-") ? "Standard Input" : filename,
                   currFileLineNumber + recordNum));
    }

    if (fieldNum > 0) put(outputStream, '\n');    // Last line w/o terminating newline.
}

unittest
{
    /* Unit tests for the csv2tsv function.
     *
     * These unit tests exercise different CSV combinations and escaping cases. The CSV
     * data content is the same for each corresponding test string, except the delimiters
     * have been changed. e.g csv6a and csv6b have the same data content.
     *
     * A property used in these tests is that changing the CSV delimiters doesn't change
     * the resulting TSV. However, changing the TSV delimiters will change the TSV result,
     * as TSV doesn't support having its delimiters in the data. This allows having a
     * single TSV expected set that is generated by CSVs with different delimiter sets.
     *
     * This test set does not test main, file handling, or error messages. These are
     * handled by tests run against the executable.
     */

    /* Default CSV. */
    auto csv1a = "a,b,c";
    auto csv2a = "a,bc,,,def";
    auto csv3a = ",a, b , cd ,";
    auto csv4a = "ß,ßÀß,あめりか物語,书名: 五色石";
    auto csv5a = "\"\n\",\"\n\n\",\"\n\n\n\"";
    auto csv6a = "\"\t\",\"\t\t\",\"\t\t\t\"";
    auto csv7a = "\",\",\",,\",\",,,\"";
    auto csv8a = "\"\",\"\"\"\",\"\"\"\"\"\"";
    auto csv9a = "\"ab, de\tfg\"\"\nhij\"";
    auto csv10a = "";
    auto csv11a = ",";
    auto csv12a = ",,";
    auto csv13a = "\"\r\",\"\r\r\",\"\r\r\r\"";
    auto csv14a = "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\"";
    auto csv15a = "\"ab, de\tfg\"\"\rhij\"";
    auto csv16a = "\"ab, de\tfg\"\"\r\nhij\"";
    auto csv17a = "ab\",ab\"cd";
    auto csv18a = "\n\n\n";
    auto csv19a = "\t";
    auto csv20a = "\t\t";
    auto csv21a = "a\n";
    auto csv22a = "a,\n";
    auto csv23a = "a,b\n";
    auto csv24a = ",\n";
    auto csv25a = "#";
    auto csv26a = "^";
    auto csv27a = "#^#";
    auto csv28a = "^#^";
    auto csv29a = "$";
    auto csv30a = "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n";
    auto csv31a = "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n";
    auto csv32a = ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\"";

    /* Set B has the same data and TSV results as set A, but uses # for quote and ^ for comma. */
    auto csv1b = "a^b^c";
    auto csv2b = "a^bc^^^def";
    auto csv3b = "^a^ b ^ cd ^";
    auto csv4b = "ß^ßÀß^あめりか物語^书名: 五色石";
    auto csv5b = "#\n#^#\n\n#^#\n\n\n#";
    auto csv6b = "#\t#^#\t\t#^#\t\t\t#";
    auto csv7b = "#,#^#,,#^#,,,#";
    auto csv8b = "##^#\"#^#\"\"#";
    auto csv9b = "#ab, de\tfg\"\nhij#";
    auto csv10b = "";
    auto csv11b = "^";
    auto csv12b = "^^";
    auto csv13b = "#\r#^#\r\r#^#\r\r\r#";
    auto csv14b = "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#";
    auto csv15b = "#ab, de\tfg\"\rhij#";
    auto csv16b = "#ab, de\tfg\"\r\nhij#";
    auto csv17b = "ab\"^ab\"cd";
    auto csv18b = "\n\n\n";
    auto csv19b = "\t";
    auto csv20b = "\t\t";
    auto csv21b = "a\n";
    auto csv22b = "a^\n";
    auto csv23b = "a^b\n";
    auto csv24b = "^\n";
    auto csv25b = "####";
    auto csv26b = "#^#";
    auto csv27b = "###^###";
    auto csv28b = "#^##^#";
    auto csv29b = "$";
    auto csv30b = "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n";
    auto csv31b = "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n";
    auto csv32b = "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#";

    /* The expected results for csv sets A and B. This is for the default TSV delimiters.
     * Each newline or TAB inside a field becomes one replacement (a single space), so
     * runs of N such characters produce N spaces.
     */
    auto tsv1 = "a\tb\tc\n";
    auto tsv2 = "a\tbc\t\t\tdef\n";
    auto tsv3 = "\ta\t b \t cd \t\n";
    auto tsv4 = "ß\tßÀß\tあめりか物語\t书名: 五色石\n";
    auto tsv5 = " \t  \t   \n";
    auto tsv6 = " \t  \t   \n";
    auto tsv7 = ",\t,,\t,,,\n";
    auto tsv8 = "\t\"\t\"\"\n";
    auto tsv9 = "ab, de fg\" hij\n";
    auto tsv10 = "";
    auto tsv11 = "\t\n";
    auto tsv12 = "\t\t\n";
    auto tsv13 = " \t  \t   \n";
    auto tsv14 = " \t  \t   \n";
    auto tsv15 = "ab, de fg\" hij\n";
    auto tsv16 = "ab, de fg\" hij\n";
    auto tsv17 = "ab\"\tab\"cd\n";
    auto tsv18 = "\n\n\n";
    auto tsv19 = " \n";
    auto tsv20 = "  \n";
    auto tsv21 = "a\n";
    auto tsv22 = "a\t\n";
    auto tsv23 = "a\tb\n";
    auto tsv24 = "\t\n";
    auto tsv25 = "#\n";
    auto tsv26 = "^\n";
    auto tsv27 = "#^#\n";
    auto tsv28 = "^#^\n";
    auto tsv29 = "$\n";
    auto tsv30 = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n";
    auto tsv31 = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n";
    auto tsv32 = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n";

    /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab.
     * This will also result in different replacements when TAB and $ appear in the CSV.
     */
    auto tsv1_x = "a$b$c\n";
    auto tsv2_x = "a$bc$$$def\n";
    auto tsv3_x = "$a$ b $ cd $\n";
    auto tsv4_x = "ß$ßÀß$あめりか物語$书名: 五色石\n";
    auto tsv5_x = " $  $   \n";
    auto tsv6_x = "\t$\t\t$\t\t\t\n";
    auto tsv7_x = ",$,,$,,,\n";
    auto tsv8_x = "$\"$\"\"\n";
    auto tsv9_x = "ab, de\tfg\" hij\n";
    auto tsv10_x = "";
    auto tsv11_x = "$\n";
    auto tsv12_x = "$$\n";
    auto tsv13_x = " $  $   \n";
    auto tsv14_x = " $  $   \n";
    auto tsv15_x = "ab, de\tfg\" hij\n";
    auto tsv16_x = "ab, de\tfg\" hij\n";
    auto tsv17_x = "ab\"$ab\"cd\n";
    auto tsv18_x = "\n\n\n";
    auto tsv19_x = "\t\n";
    auto tsv20_x = "\t\t\n";
    auto tsv21_x = "a\n";
    auto tsv22_x = "a$\n";
    auto tsv23_x = "a$b\n";
    auto tsv24_x = "$\n";
    auto tsv25_x = "#\n";
    auto tsv26_x = "^\n";
    auto tsv27_x = "#^#\n";
    auto tsv28_x = "^#^\n";
    auto tsv29_x = " \n";
    auto tsv30_x = " $ \n $  $  \n^# $ #^$# ^$^ #\n";
    auto tsv31_x = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n";
    auto tsv32_x = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n";

    /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab,
     * and with the delimiter/newline replacement string being |--|. Basically, newlines
     * and '$' in the original data are replaced by |--|.
     */
    auto tsv1_y = "a$b$c\n";
    auto tsv2_y = "a$bc$$$def\n";
    auto tsv3_y = "$a$ b $ cd $\n";
    auto tsv4_y = "ß$ßÀß$あめりか物語$书名: 五色石\n";
    auto tsv5_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv6_y = "\t$\t\t$\t\t\t\n";
    auto tsv7_y = ",$,,$,,,\n";
    auto tsv8_y = "$\"$\"\"\n";
    auto tsv9_y = "ab, de\tfg\"|--|hij\n";
    auto tsv10_y = "";
    auto tsv11_y = "$\n";
    auto tsv12_y = "$$\n";
    auto tsv13_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv14_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv15_y = "ab, de\tfg\"|--|hij\n";
    auto tsv16_y = "ab, de\tfg\"|--|hij\n";
    auto tsv17_y = "ab\"$ab\"cd\n";
    auto tsv18_y = "\n\n\n";
    auto tsv19_y = "\t\n";
    auto tsv20_y = "\t\t\n";
    auto tsv21_y = "a\n";
    auto tsv22_y = "a$\n";
    auto tsv23_y = "a$b\n";
    auto tsv24_y = "$\n";
    auto tsv25_y = "#\n";
    auto tsv26_y = "^\n";
    auto tsv27_y = "#^#\n";
    auto tsv28_y = "^#^\n";
    auto tsv29_y = "|--|\n";
    auto tsv30_y = "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n";
    auto tsv31_y = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n";
    auto tsv32_y = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n";

    auto csvSet1a = [csv1a, csv2a, csv3a, csv4a, csv5a, csv6a, csv7a, csv8a, csv9a, csv10a,
                     csv11a, csv12a, csv13a, csv14a, csv15a, csv16a, csv17a, csv18a, csv19a, csv20a,
                     csv21a, csv22a, csv23a, csv24a, csv25a, csv26a, csv27a, csv28a, csv29a, csv30a,
                     csv31a, csv32a];

    auto csvSet1b = [csv1b, csv2b, csv3b, csv4b, csv5b, csv6b, csv7b, csv8b, csv9b, csv10b,
                     csv11b, csv12b, csv13b, csv14b, csv15b, csv16b, csv17b, csv18b, csv19b, csv20b,
                     csv21b, csv22b, csv23b, csv24b, csv25b, csv26b, csv27b, csv28b, csv29b, csv30b,
                     csv31b, csv32b];

    auto tsvSet1 = [tsv1, tsv2, tsv3, tsv4, tsv5, tsv6, tsv7, tsv8, tsv9, tsv10,
                    tsv11, tsv12, tsv13, tsv14, tsv15, tsv16, tsv17, tsv18, tsv19, tsv20,
                    tsv21, tsv22, tsv23, tsv24, tsv25, tsv26, tsv27, tsv28, tsv29, tsv30,
                    tsv31, tsv32];

    auto tsvSet1_x = [tsv1_x, tsv2_x, tsv3_x, tsv4_x, tsv5_x, tsv6_x, tsv7_x, tsv8_x, tsv9_x, tsv10_x,
                      tsv11_x, tsv12_x, tsv13_x, tsv14_x, tsv15_x, tsv16_x, tsv17_x, tsv18_x, tsv19_x, tsv20_x,
                      tsv21_x, tsv22_x, tsv23_x, tsv24_x, tsv25_x, tsv26_x, tsv27_x, tsv28_x, tsv29_x, tsv30_x,
                      tsv31_x, tsv32_x];

    auto tsvSet1_y = [tsv1_y, tsv2_y, tsv3_y, tsv4_y, tsv5_y, tsv6_y, tsv7_y, tsv8_y, tsv9_y, tsv10_y,
                      tsv11_y, tsv12_y, tsv13_y, tsv14_y, tsv15_y, tsv16_y, tsv17_y, tsv18_y, tsv19_y, tsv20_y,
                      tsv21_y, tsv22_y, tsv23_y, tsv24_y, tsv25_y, tsv26_y, tsv27_y, tsv28_y, tsv29_y, tsv30_y,
                      tsv31_y, tsv32_y];

    foreach (i, csva, csvb, tsv, tsv_x, tsv_y; lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y))
    {
        import std.conv : to;

        /* Byte streams for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */
        ubyte[] csvInputA = cast(ubyte[])csva;
        ubyte[] csvInputB = cast(ubyte[])csvb;

        /* CSV Set A vs TSV expected. */
        auto tsvResultA = appender!(char[])();
        csv2tsv(csvInputA, tsvResultA, "csvInputA_defaultTSV", i);
        assert(tsv == tsvResultA.data,
               format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csva, tsv, tsvResultA.data));

        /* CSV Set B vs TSV expected. Different CSV delimiters, same TSV results as CSV Set A.*/
        auto tsvResultB = appender!(char[])();
        csv2tsv(csvInputB, tsvResultB, "csvInputB_defaultTSV", i, '#', '^');
        assert(tsv == tsvResultB.data,
               format("Unittest failure. tsv != tsvResultB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csvb, tsv, tsvResultB.data));

        /* CSV Set A and TSV with $ separator.*/
        csvInputA = cast(ubyte[])csva;
        auto tsvResult_XA = appender!(char[])();
        csv2tsv(csvInputA, tsvResult_XA, "csvInputA_TSV_WithDollarDelimiter", i, '"', ',', '$');
        assert(tsv_x == tsvResult_XA.data,
               format("Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csva, tsv_x, tsvResult_XA.data));

        /* CSV Set B and TSV with $ separator. Same TSV results as CSV Set A.*/
        csvInputB = cast(ubyte[])csvb;
        auto tsvResult_XB = appender!(char[])();
        csv2tsv(csvInputB, tsvResult_XB, "csvInputB__TSV_WithDollarDelimiter", i, '#', '^', '$');
        assert(tsv_x == tsvResult_XB.data,
               format("Unittest failure. tsv_x != tsvResult_XB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csvb, tsv_x, tsvResult_XB.data));

        /* CSV Set A and TSV with $ separator and tsv delimiter/newline replacement. */
        csvInputA = cast(ubyte[])csva;
        auto tsvResult_YA = appender!(char[])();
        csv2tsv(csvInputA, tsvResult_YA, "csvInputA_TSV_WithDollarAndDelimReplacement", i, '"', ',', '$', "|--|");
        assert(tsv_y == tsvResult_YA.data,
               format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csva, tsv_y, tsvResult_YA.data));

        /* CSV Set B and TSV with $ separator and tsv delimiter/newline replacement. Same TSV as CSV Set A.*/
        csvInputB = cast(ubyte[])csvb;
        auto tsvResult_YB = appender!(char[])();
        csv2tsv(csvInputB, tsvResult_YB, "csvInputB__TSV_WithDollarAndDelimReplacement", i, '#', '^', '$', "|--|");
        assert(tsv_y == tsvResult_YB.data,
               format("Unittest failure. tsv_y != tsvResult_YB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csvb, tsv_y, tsvResult_YB.data));

    }
}

unittest
{
    /* Unit tests for 'maxRecords' feature of the csv2tsv function.
     */

    /* Input CSV. */
    auto csv1 = "";
    auto csv2 = ",";
    auto csv3 = "a";
    auto csv4 = "a\n";
    auto csv5 = "a\nb";
    auto csv6 = "a\nb\n";
    auto csv7 = "a\nb\nc";
    auto csv8 = "a\nb\nc\n";
    auto csv9 = "a,aa";
    auto csv10 = "a,aa\n";
    auto csv11 = "a,aa\nb,bb";
    auto csv12 = "a,aa\nb,bb\n";
    auto csv13 = "a,aa\nb,bb\nc,cc";
    auto csv14 = "a,aa\nb,bb\nc,cc\n";

    auto csv15 = "\"a\",\"aa\"";
    auto csv16 = "\"a\",\"aa\"\n";
    auto csv17 = "\"a\",\"aa\"\n\"b\",\"bb\"";
    auto csv18 = "\"a\",\"aa\"\n\"b\",\"bb\"\n";
    auto csv19 = "\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"";
    auto csv20 = "\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"\n";

    /* TSV with max 1 record. */
    auto tsv1_max1 = "";
    auto tsv2_max1 = "\t\n";
    auto tsv3_max1 = "a\n";
    auto tsv4_max1 = "a\n";
    auto tsv5_max1 = "a\n";
    auto tsv6_max1 = "a\n";
    auto tsv7_max1 = "a\n";
    auto tsv8_max1 = "a\n";
    auto tsv9_max1 = "a\taa\n";
    auto tsv10_max1 = "a\taa\n";
    auto tsv11_max1 = "a\taa\n";
    auto tsv12_max1 = "a\taa\n";
    auto tsv13_max1 = "a\taa\n";
    auto tsv14_max1 = "a\taa\n";

    auto tsv15_max1 = "a\taa\n";
    auto tsv16_max1 = "a\taa\n";
    auto tsv17_max1 = "a\taa\n";
    auto tsv18_max1 = "a\taa\n";
    auto tsv19_max1 = "a\taa\n";
    auto tsv20_max1 = "a\taa\n";

    /* Remaining TSV converted after first call. */
    auto tsv1_max1_rest = "";
    auto tsv2_max1_rest = "";
    auto tsv3_max1_rest = "";
    auto tsv4_max1_rest = "";
    auto tsv5_max1_rest = "b\n";
    auto tsv6_max1_rest = "b\n";
    auto tsv7_max1_rest = "b\nc\n";
    auto tsv8_max1_rest = "b\nc\n";
    auto tsv9_max1_rest = "";
    auto tsv10_max1_rest = "";
    auto tsv11_max1_rest = "b\tbb\n";
    auto tsv12_max1_rest = "b\tbb\n";
    auto tsv13_max1_rest = "b\tbb\nc\tcc\n";
    auto tsv14_max1_rest = "b\tbb\nc\tcc\n";

    auto tsv15_max1_rest = "";
    auto tsv16_max1_rest = "";
    auto tsv17_max1_rest = "b\tbb\n";
    auto tsv18_max1_rest = "b\tbb\n";
    auto tsv19_max1_rest = "b\tbb\nc\tcc\n";
    auto tsv20_max1_rest = "b\tbb\nc\tcc\n";

    /* TSV with max 2 records. */
    auto tsv1_max2 = "";
    auto tsv2_max2 = "\t\n";
    auto tsv3_max2 = "a\n";
    auto tsv4_max2 = "a\n";
    auto tsv5_max2 = "a\nb\n";
    auto tsv6_max2 = "a\nb\n";
    auto tsv7_max2 = "a\nb\n";
    auto tsv8_max2 = "a\nb\n";
    auto tsv9_max2 = "a\taa\n";
    auto tsv10_max2 = "a\taa\n";
    auto tsv11_max2 = "a\taa\nb\tbb\n";
    auto tsv12_max2 = "a\taa\nb\tbb\n";
    auto tsv13_max2 = "a\taa\nb\tbb\n";
    auto tsv14_max2 = "a\taa\nb\tbb\n";

    auto tsv15_max2 = "a\taa\n";
    auto tsv16_max2 = "a\taa\n";
    auto tsv17_max2 = "a\taa\nb\tbb\n";
    auto tsv18_max2 = "a\taa\nb\tbb\n";
    auto tsv19_max2 = "a\taa\nb\tbb\n";
    auto tsv20_max2 = "a\taa\nb\tbb\n";

    /* Remaining TSV converted after first call. */
    auto tsv1_max2_rest = "";
    auto tsv2_max2_rest = "";
    auto tsv3_max2_rest = "";
    auto tsv4_max2_rest = "";
    auto tsv5_max2_rest = "";
    auto tsv6_max2_rest = "";
    auto tsv7_max2_rest = "c\n";
    auto tsv8_max2_rest = "c\n";
    auto tsv9_max2_rest = "";
    auto tsv10_max2_rest = "";
    auto tsv11_max2_rest = "";
    auto tsv12_max2_rest = "";
    auto tsv13_max2_rest = "c\tcc\n";
    auto tsv14_max2_rest = "c\tcc\n";

    auto tsv15_max2_rest = "";
    auto tsv16_max2_rest = "";
    auto tsv17_max2_rest = "";
    auto tsv18_max2_rest = "";
    auto tsv19_max2_rest = "c\tcc\n";
    auto tsv20_max2_rest = "c\tcc\n";

    auto csvSet1 =
        [csv1, csv2, csv3, csv4, csv5, csv6, csv7,
         csv8, csv9, csv10, csv11, csv12, csv13, csv14,
         csv15, csv16, csv17, csv18, csv19, csv20 ];

    auto tsvMax1Set1 =
        [tsv1_max1, tsv2_max1, tsv3_max1, tsv4_max1, tsv5_max1, tsv6_max1, tsv7_max1,
         tsv8_max1, tsv9_max1, tsv10_max1, tsv11_max1, tsv12_max1, tsv13_max1, tsv14_max1,
         tsv15_max1, tsv16_max1, tsv17_max1, tsv18_max1, tsv19_max1, tsv20_max1];

    auto tsvMax1RestSet1 =
        [tsv1_max1_rest, tsv2_max1_rest, tsv3_max1_rest, tsv4_max1_rest, tsv5_max1_rest, tsv6_max1_rest, tsv7_max1_rest,
         tsv8_max1_rest, tsv9_max1_rest, tsv10_max1_rest, tsv11_max1_rest, tsv12_max1_rest, tsv13_max1_rest, tsv14_max1_rest,
         tsv15_max1_rest, tsv16_max1_rest, tsv17_max1_rest, tsv18_max1_rest, tsv19_max1_rest, tsv20_max1_rest];

    auto tsvMax2Set1 =
        [tsv1_max2, tsv2_max2, tsv3_max2, tsv4_max2, tsv5_max2, tsv6_max2, tsv7_max2,
         tsv8_max2, tsv9_max2, tsv10_max2, tsv11_max2, tsv12_max2, tsv13_max2, tsv14_max2,
         tsv15_max2, tsv16_max2, tsv17_max2, tsv18_max2, tsv19_max2, tsv20_max2];

    auto tsvMax2RestSet1 =
        [tsv1_max2_rest, tsv2_max2_rest, tsv3_max2_rest, tsv4_max2_rest, tsv5_max2_rest, tsv6_max2_rest, tsv7_max2_rest,
         tsv8_max2_rest, tsv9_max2_rest, tsv10_max2_rest, tsv11_max2_rest, tsv12_max2_rest, tsv13_max2_rest, tsv14_max2_rest,
         tsv15_max2_rest, tsv16_max2_rest, tsv17_max2_rest, tsv18_max2_rest, tsv19_max2_rest, tsv20_max2_rest];

    foreach (i, csv, tsv_max1, tsv_max1_rest, tsv_max2, tsv_max2_rest;
             lockstep(csvSet1, tsvMax1Set1, tsvMax1RestSet1, tsvMax2Set1, tsvMax2RestSet1))
    {
        /* Byte stream for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */
        ubyte[] csvInput = cast(ubyte[])csv;

        /* Call with maxRecords == 1. */
        auto tsvMax1Result = appender!(char[])();
        csv2tsv(csvInput, tsvMax1Result, "maxRecords-one", i, '"', ',', '\t', " ", NullableSizeT(1));
        assert(tsv_max1 == tsvMax1Result.data,
               format("Unittest failure. tsv_max1 != tsvMax1Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max1, tsvMax1Result.data));

        /* Follow-up call getting all records remaining after the maxRecords==1 call. */
        auto tsvMax1RestResult = appender!(char[])();
        csv2tsv(csvInput, tsvMax1RestResult, "maxRecords-one-followup", i);
        assert(tsv_max1_rest == tsvMax1RestResult.data,
               format("Unittest failure. tsv_max1_rest != tsvMax1RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max1_rest, tsvMax1RestResult.data));

        /* Reset the input stream for maxRecords == 2. */
        csvInput = cast(ubyte[])csv;

        /* Call with maxRecords == 2. */
        auto tsvMax2Result = appender!(char[])();
        csv2tsv(csvInput, tsvMax2Result, "maxRecords-two", i, '"', ',', '\t', " ", NullableSizeT(2));
        assert(tsv_max2 == tsvMax2Result.data,
               format("Unittest failure. tsv_max2 != tsvMax2Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max2, tsvMax2Result.data));

        /* Follow-up call getting all records remaining after the maxRecords==2 call. */
        auto tsvMax2RestResult = appender!(char[])();
        csv2tsv(csvInput, tsvMax2RestResult, "maxRecords-two-followup", i);
        assert(tsv_max2_rest == tsvMax2RestResult.data,
               format("Unittest failure. tsv_max2_rest != tsvMax2RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max2_rest, tsvMax2RestResult.data));
    }
}