/**
Convert CSV formatted data to TSV format.

This program converts comma-separated value data to tab-separated format.

Copyright (c) 2016-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.csv2tsv;

import std.stdio;
import std.exception : enforce;
import std.format : format;
import std.range;
import std.traits : Unqual;
import std.typecons : Nullable, tuple;

immutable helpText = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details the CSV formats accepted.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.

Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.

Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.

There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.

Behaviors of this program that often vary between CSV implementations:
* Newlines are supported in quoted fields.
* Double quotes are permitted in a non-quoted field. However, a field starting
with a quote must follow quoting rules.
* Each record can have a different numbers of fields.
* The three common forms of newlines are supported: CR, CRLF, LF.
* A newline will be added if the file does not end with one.
* No whitespace trimming is done.

This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.

UTF-8 input is assumed. Convert other encodings prior to invoking this tool.

Options:
EOS";

/** Command line option values for csv2tsv, together with the parser that fills them in.
 */
struct Csv2tsvOptions
{
    string programName;
    bool helpVerbose = false;          // --help-verbose
    bool hasHeader = false;            // --H|header
    char csvQuoteChar = '"';           // --q|quote
    char csvDelimChar = ',';           // --c|csv-delim
    char tsvDelimChar = '\t';          // --t|tsv-delim
    string tsvDelimReplacement = " ";  // --r|replacement
    bool versionWanted = false;        // --V|version

    /** Parses the command line into this struct and validates the result.
     *
     * Recognized options are removed from cmdArgs by getopt; what remains is the
     * program name followed by any non-option arguments.
     *
     * Returns: A (bool, int) tuple. The bool is true when the caller should go on
     * to process input. When it is false the int is the exit code to use: 0 after
     * printing help or version text, 1 after reporting an argument error on stderr.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        if (cmdArgs.length > 0) programName = cmdArgs[0].stripExtension.baseName;
        else programName = "Unknown_program_name";

        try
        {
            auto parsed = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote", "CHR Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim", "CHR Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim", "CHR Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|replacement", "STR Replacement for newline and TSV field delimiters found in CSV input. Default: Space.", &tsvDelimReplacement,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            /* Informational requests: print and stop. Standard help takes precedence
             * over verbose help, which takes precedence over the version notice.
             */
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }
            if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, parsed.options);
                return tuple(false, 0);
            }
            if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Consistency checks. The first failing check determines the message. */
            bool isNewline(char c) { return c == '\n' || c == '\r'; }

            enforce(!isNewline(csvQuoteChar),
                    "CSV quote character cannot be newline (--q|quote).");

            enforce(csvQuoteChar != csvDelimChar,
                    "CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");

            enforce(csvQuoteChar != tsvDelimChar,
                    "CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");

            enforce(!isNewline(csvDelimChar),
                    "CSV field delimiter cannot be newline (--c|csv-delim).");

            enforce(!isNewline(tsvDelimChar),
                    "TSV field delimiter cannot be newline (--t|tsv-delim).");

            enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(tsvDelimReplacement),
                    "Replacement character cannot contain newlines or TSV field delimiters (--r|replacement).");
        }
        catch (Exception exc)
        {
            /* Covers both getopt parse failures and the enforce checks above. */
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/* Runtime configuration read by the D runtime at startup. "gcopt=cleanup:none" is a
 * garbage collector option; it asks the GC to skip its cleanup pass at program exit.
 * Only declared for compiler front-end versions 2085 and later.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // No main in unittest
}
else
{
    /** Program entry point.
     *
     * Parses the command line, then converts every file named on it (or standard
     * input when none are named) from CSV to TSV on standard output.
     *
     * Returns: 0 on success, or when only help/version text was requested. 1 on a
     * command line error or when conversion throws.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];   // Help, version, or argument error: nothing to convert.
        version(LDC_Profile)
        {
            /* Reset profile counters so the profile covers conversion only. */
            import ldc.profile : resetAll;
            resetAll();
        }
        /* getopt removed the recognized options; cmdArgs[0] is the program name. */
        try csv2tsvFiles(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            /* Terminate any partially written output line and push it out on stdout
             * before the error text goes to stderr. The stream that needs flushing
             * is the output stream; flushing stdin does nothing useful here.
             */
            writeln();
            stdout.flush();
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}

/* This uses a D feature where a type can reserve a single value to represent null. */
alias NullableSizeT = Nullable!(size_t, size_t.max);


/** csv2tsvFiles reads multiple files and standard input and writes the results to
 * standard output.
 *
 * Params:
 *   cmdopt      =  Parsed command line options (delimiters, quote, header handling).
 *   inputFiles  =  Files to convert, in order. "-" means standard input. An empty
 *                  list is treated as a single "-".
 *
 * When cmdopt.hasHeader is set, the first record of every file after the first is
 * read but discarded, so only the first file's header reaches the output.
 *
 * Throws: Exception from csv2tsv on inconsistent CSV. Errors opening or reading
 * a file also propagate to the caller.
 */
void csv2tsvFiles(const ref Csv2tsvOptions cmdopt, const string[] inputFiles)
{
    import std.algorithm : joiner;
    import tsv_utils.common.utils : BufferedOutputRange;

    /* One read buffer serves both sources. Standard input reads use only its first
     * 1024 bytes; file reads use all of it.
     */
    ubyte[1024 * 128] fileRawBuf;
    ubyte[] stdinRawBuf = fileRawBuf[0..1024];
    auto stdoutWriter = BufferedOutputRange!(typeof(stdout))(stdout);
    bool firstFile = true;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        /* byChunk yields ubyte[] blocks; joiner flattens them to a ubyte range. */
        auto ubyteChunkedStream = (filename == "-") ?
            stdin.byChunk(stdinRawBuf) : filename.File.byChunk(fileRawBuf);
        auto ubyteStream = ubyteChunkedStream.joiner;

        if (firstFile || !cmdopt.hasHeader)
        {
            csv2tsv(ubyteStream, stdoutWriter, filename, 0,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement);
        }
        else
        {
            /* Don't write the header on subsequent files. Write the first
             * record to a null sink instead.
             */
            auto nullWriter = NullSink();
            csv2tsv(ubyteStream, nullWriter, filename, 0,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                    NullableSizeT(1));

            /* Convert the rest of the file. The starting line number is 1 because
             * the header record has already been consumed from the stream.
             */
            csv2tsv(ubyteStream, stdoutWriter, filename, 1,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement);
        }
        firstFile = false;
    }
}

/** Read CSV from an input source, convert to TSV and write to an output source.
 *
 * Params:
 *   InputRange          =  A ubyte input range to read CSV text from. A ubyte range
 *                          matches byChunk. It also avoids conversion to dchar by front().
 *   OutputRange         =  An output range to write TSV text to.
 *   filename            =  Name of file to use when reporting errors. A descriptive name
 *                          can be used in lieu of a file name.
 *   currFileLineNumber  =  First line being processed. Used when reporting errors. Needed
 *                          only when part of the input has already been processed.
 *   csvQuote            =  The quoting character used in the input CSV file.
 *   csvDelim            =  The field delimiter character used in the input CSV file.
 *   tsvDelim            =  The field delimiter character to use in the generated TSV file.
 *   tsvDelimReplacement =  A string to use when replacing newlines and TSV field delimiters
 *                          occurring in CSV fields.
 *   maxRecords          =  The maximum number of records to process (output lines). This is
 *                          intended to support processing the header line separately.
 *
 * Throws: Exception on finding inconsistent CSV.
Exception text includes the filename and
 *   line number where the error was identified.
 */
void csv2tsv(InputRange, OutputRange)
    (auto ref InputRange inputStream, auto ref OutputRange outputStream,
     string filename = "(none)", size_t currFileLineNumber = 0,
     const char csvQuote = '"', const char csvDelim = ',', const char tsvDelim = '\t',
     string tsvDelimReplacement = " ",
     NullableSizeT maxRecords=NullableSizeT.init,
     )
if (isInputRange!InputRange && isOutputRange!(OutputRange, char) &&
    is(Unqual!(ElementType!InputRange) == ubyte))
{
    /* Parser states. FieldEnd is both the initial state and the state right after
     * a field or record terminator has been consumed.
     */
    enum State { FieldEnd, NonQuotedField, QuotedField, QuoteInQuotedField }

    State state = State.FieldEnd;
    size_t recordNum = 1;   // Record being written; doubles as the output line number.
    size_t fieldNum = 0;    // Fields started on the current record. Zero: nothing pending.

    /* Message used for both ways a quoted field can turn out to be malformed. */
    string badQuoteMessage()
    {
        return format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                      (filename == "-") ? "Standard Input" : filename,
                      currFileLineNumber + recordNum);
    }

    /* True once more records have been completed than the caller asked for. */
    bool recordLimitReached()
    {
        return !maxRecords.isNull && recordNum > maxRecords;
    }

InputLoop: while (!inputStream.empty)
    {
        char ch = inputStream.front;
        inputStream.popFront;

        /* Normalize CR and CRLF to a single LF before the state machine sees it. */
        if (ch == '\r')
        {
            if (!inputStream.empty && inputStream.front == '\n') inputStream.popFront;
            ch = '\n';
        }

    OuterSwitch: final switch (state)
        {
        case State.FieldEnd:
            /* A new field begins with this character. */
            ++fieldNum;

            /* An if-statement rather than a switch: the 'goto case' below must
             * target the outer switch.
             */
            if (ch == csvQuote)
            {
                state = State.QuotedField;
                break OuterSwitch;
            }

            /* Not a quote. Only the state changes; the same character is handled
             * again as the first character of a non-quoted field.
             */
            state = State.NonQuotedField;
            goto case State.NonQuotedField;

        case State.NonQuotedField:
            switch (ch)
            {
            default:
                put(outputStream, ch);
                break OuterSwitch;
            case csvDelim:
                put(outputStream, tsvDelim);
                state = State.FieldEnd;
                break OuterSwitch;
            case tsvDelim:
                /* The output delimiter cannot appear inside TSV data. */
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            case '\n':
                put(outputStream, '\n');
                ++recordNum;
                fieldNum = 0;
                state = State.FieldEnd;
                if (recordLimitReached()) break InputLoop;
                break OuterSwitch;
            }

        case State.QuotedField:
            switch (ch)
            {
            default:
                put(outputStream, ch);
                break OuterSwitch;
            case csvQuote:
                /* Either an escaped quote or the end of the field; the next
                 * character decides. If the input ends here, this is a quoted
                 * field closing the last line without a trailing newline: go
                 * straight to FieldEnd so the post-loop check does not flag it.
                 */
                state = inputStream.empty ? State.FieldEnd : State.QuoteInQuotedField;
                break OuterSwitch;
            case '\n':
                /* Newline inside quotes is data, and is not allowed in TSV data. */
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            case tsvDelim:
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            }

        case State.QuoteInQuotedField:
            /* The previous character was a quote inside a quoted field. */
            switch (ch)
            {
            case csvQuote:
                /* Doubled quote: one literal quote character, still inside the field. */
                put(outputStream, csvQuote);
                state = State.QuotedField;
                break OuterSwitch;
            case csvDelim:
                put(outputStream, tsvDelim);
                state = State.FieldEnd;
                break OuterSwitch;
            case '\n':
                put(outputStream, '\n');
                ++recordNum;
                fieldNum = 0;
                state = State.FieldEnd;
                if (recordLimitReached()) break InputLoop;
                break OuterSwitch;
            default:
                /* Anything else after a closing quote is malformed. */
                throw new Exception(badQuoteMessage());
            }
        }
    }

    /* Input ran out while still inside an open quoted field. */
    enforce(state != State.QuotedField, badQuoteMessage());

    if (fieldNum > 0) put(outputStream, '\n');    // Last line w/o terminating newline.
}

unittest
{
    /* Unit tests for the csv2tsv function.
     *
     * These unit tests exercise different CSV combinations and escaping cases. The CSV
     * data content is the same for each corresponding test string, except the delimiters
     * have been changed. e.g. csv6a and csv6b have the same data content.
     *
     * A property used in these tests is that changing the CSV delimiters doesn't change
     * the resulting TSV. However, changing the TSV delimiters will change the TSV result,
     * as TSV doesn't support having its delimiters in the data. This allows having a
     * single TSV expected set that is generated by CSVs with different delimiter sets.
     *
     * This test set does not test main, file handling, or error messages. These are
     * handled by tests run against the executable.
     */

    /* Default CSV.
*/ 416 auto csv1a = "a,b,c"; 417 auto csv2a = "a,bc,,,def"; 418 auto csv3a = ",a, b , cd ,"; 419 auto csv4a = "ß,ßÀß,あめりか物語,书名: 五色石"; 420 auto csv5a = "\"\n\",\"\n\n\",\"\n\n\n\""; 421 auto csv6a = "\"\t\",\"\t\t\",\"\t\t\t\""; 422 auto csv7a = "\",\",\",,\",\",,,\""; 423 auto csv8a = "\"\",\"\"\"\",\"\"\"\"\"\""; 424 auto csv9a = "\"ab, de\tfg\"\"\nhij\""; 425 auto csv10a = ""; 426 auto csv11a = ","; 427 auto csv12a = ",,"; 428 auto csv13a = "\"\r\",\"\r\r\",\"\r\r\r\""; 429 auto csv14a = "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\""; 430 auto csv15a = "\"ab, de\tfg\"\"\rhij\""; 431 auto csv16a = "\"ab, de\tfg\"\"\r\nhij\""; 432 auto csv17a = "ab\",ab\"cd"; 433 auto csv18a = "\n\n\n"; 434 auto csv19a = "\t"; 435 auto csv20a = "\t\t"; 436 auto csv21a = "a\n"; 437 auto csv22a = "a,\n"; 438 auto csv23a = "a,b\n"; 439 auto csv24a = ",\n"; 440 auto csv25a = "#"; 441 auto csv26a = "^"; 442 auto csv27a = "#^#"; 443 auto csv28a = "^#^"; 444 auto csv29a = "$"; 445 auto csv30a = "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n"; 446 auto csv31a = "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n"; 447 auto csv32a = ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\""; 448 449 /* Set B has the same data and TSV results as set A, but uses # for quote and ^ for comma. 
*/ 450 auto csv1b = "a^b^c"; 451 auto csv2b = "a^bc^^^def"; 452 auto csv3b = "^a^ b ^ cd ^"; 453 auto csv4b = "ß^ßÀß^あめりか物語^书名: 五色石"; 454 auto csv5b = "#\n#^#\n\n#^#\n\n\n#"; 455 auto csv6b = "#\t#^#\t\t#^#\t\t\t#"; 456 auto csv7b = "#,#^#,,#^#,,,#"; 457 auto csv8b = "##^#\"#^#\"\"#"; 458 auto csv9b = "#ab, de\tfg\"\nhij#"; 459 auto csv10b = ""; 460 auto csv11b = "^"; 461 auto csv12b = "^^"; 462 auto csv13b = "#\r#^#\r\r#^#\r\r\r#"; 463 auto csv14b = "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#"; 464 auto csv15b = "#ab, de\tfg\"\rhij#"; 465 auto csv16b = "#ab, de\tfg\"\r\nhij#"; 466 auto csv17b = "ab\"^ab\"cd"; 467 auto csv18b = "\n\n\n"; 468 auto csv19b = "\t"; 469 auto csv20b = "\t\t"; 470 auto csv21b = "a\n"; 471 auto csv22b = "a^\n"; 472 auto csv23b = "a^b\n"; 473 auto csv24b = "^\n"; 474 auto csv25b = "####"; 475 auto csv26b = "#^#"; 476 auto csv27b = "###^###"; 477 auto csv28b = "#^##^#"; 478 auto csv29b = "$"; 479 auto csv30b = "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n"; 480 auto csv31b = "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n"; 481 auto csv32b = "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#"; 482 483 /* The expected results for csv sets A and B. 
This is for the default TSV delimiters.*/ 484 auto tsv1 = "a\tb\tc\n"; 485 auto tsv2 = "a\tbc\t\t\tdef\n"; 486 auto tsv3 = "\ta\t b \t cd \t\n"; 487 auto tsv4 = "ß\tßÀß\tあめりか物語\t书名: 五色石\n"; 488 auto tsv5 = " \t \t \n"; 489 auto tsv6 = " \t \t \n"; 490 auto tsv7 = ",\t,,\t,,,\n"; 491 auto tsv8 = "\t\"\t\"\"\n"; 492 auto tsv9 = "ab, de fg\" hij\n"; 493 auto tsv10 = ""; 494 auto tsv11 = "\t\n"; 495 auto tsv12 = "\t\t\n"; 496 auto tsv13 = " \t \t \n"; 497 auto tsv14 = " \t \t \n"; 498 auto tsv15 = "ab, de fg\" hij\n"; 499 auto tsv16 = "ab, de fg\" hij\n"; 500 auto tsv17 = "ab\"\tab\"cd\n"; 501 auto tsv18 = "\n\n\n"; 502 auto tsv19 = " \n"; 503 auto tsv20 = " \n"; 504 auto tsv21 = "a\n"; 505 auto tsv22 = "a\t\n"; 506 auto tsv23 = "a\tb\n"; 507 auto tsv24 = "\t\n"; 508 auto tsv25 = "#\n"; 509 auto tsv26 = "^\n"; 510 auto tsv27 = "#^#\n"; 511 auto tsv28 = "^#^\n"; 512 auto tsv29 = "$\n"; 513 auto tsv30 = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n"; 514 auto tsv31 = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n"; 515 auto tsv32 = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n"; 516 517 /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab. 518 * This will also result in different replacements when TAB and $ appear in the CSV. 
519 */ 520 auto tsv1_x = "a$b$c\n"; 521 auto tsv2_x = "a$bc$$$def\n"; 522 auto tsv3_x = "$a$ b $ cd $\n"; 523 auto tsv4_x = "ß$ßÀß$あめりか物語$书名: 五色石\n"; 524 auto tsv5_x = " $ $ \n"; 525 auto tsv6_x = "\t$\t\t$\t\t\t\n"; 526 auto tsv7_x = ",$,,$,,,\n"; 527 auto tsv8_x = "$\"$\"\"\n"; 528 auto tsv9_x = "ab, de\tfg\" hij\n"; 529 auto tsv10_x = ""; 530 auto tsv11_x = "$\n"; 531 auto tsv12_x = "$$\n"; 532 auto tsv13_x = " $ $ \n"; 533 auto tsv14_x = " $ $ \n"; 534 auto tsv15_x = "ab, de\tfg\" hij\n"; 535 auto tsv16_x = "ab, de\tfg\" hij\n"; 536 auto tsv17_x = "ab\"$ab\"cd\n"; 537 auto tsv18_x = "\n\n\n"; 538 auto tsv19_x = "\t\n"; 539 auto tsv20_x = "\t\t\n"; 540 auto tsv21_x = "a\n"; 541 auto tsv22_x = "a$\n"; 542 auto tsv23_x = "a$b\n"; 543 auto tsv24_x = "$\n"; 544 auto tsv25_x = "#\n"; 545 auto tsv26_x = "^\n"; 546 auto tsv27_x = "#^#\n"; 547 auto tsv28_x = "^#^\n"; 548 auto tsv29_x = " \n"; 549 auto tsv30_x = " $ \n $ $ \n^# $ #^$# ^$^ #\n"; 550 auto tsv31_x = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n"; 551 auto tsv32_x = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n"; 552 553 /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab, 554 * and with the delimiter/newline replacement string being |--|. Basically, newlines 555 * and '$' in the original data are replaced by |--|. 
556 */ 557 auto tsv1_y = "a$b$c\n"; 558 auto tsv2_y = "a$bc$$$def\n"; 559 auto tsv3_y = "$a$ b $ cd $\n"; 560 auto tsv4_y = "ß$ßÀß$あめりか物語$书名: 五色石\n"; 561 auto tsv5_y = "|--|$|--||--|$|--||--||--|\n"; 562 auto tsv6_y = "\t$\t\t$\t\t\t\n"; 563 auto tsv7_y = ",$,,$,,,\n"; 564 auto tsv8_y = "$\"$\"\"\n"; 565 auto tsv9_y = "ab, de\tfg\"|--|hij\n"; 566 auto tsv10_y = ""; 567 auto tsv11_y = "$\n"; 568 auto tsv12_y = "$$\n"; 569 auto tsv13_y = "|--|$|--||--|$|--||--||--|\n"; 570 auto tsv14_y = "|--|$|--||--|$|--||--||--|\n"; 571 auto tsv15_y = "ab, de\tfg\"|--|hij\n"; 572 auto tsv16_y = "ab, de\tfg\"|--|hij\n"; 573 auto tsv17_y = "ab\"$ab\"cd\n"; 574 auto tsv18_y = "\n\n\n"; 575 auto tsv19_y = "\t\n"; 576 auto tsv20_y = "\t\t\n"; 577 auto tsv21_y = "a\n"; 578 auto tsv22_y = "a$\n"; 579 auto tsv23_y = "a$b\n"; 580 auto tsv24_y = "$\n"; 581 auto tsv25_y = "#\n"; 582 auto tsv26_y = "^\n"; 583 auto tsv27_y = "#^#\n"; 584 auto tsv28_y = "^#^\n"; 585 auto tsv29_y = "|--|\n"; 586 auto tsv30_y = "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n"; 587 auto tsv31_y = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n"; 588 auto tsv32_y = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n"; 589 590 auto csvSet1a = [csv1a, csv2a, csv3a, csv4a, csv5a, csv6a, csv7a, csv8a, csv9a, csv10a, 591 csv11a, csv12a, csv13a, csv14a, csv15a, csv16a, csv17a, csv18a, csv19a, csv20a, 592 csv21a, csv22a, csv23a, csv24a, csv25a, csv26a, csv27a, csv28a, csv29a, csv30a, 593 csv31a, csv32a]; 594 595 auto csvSet1b = [csv1b, csv2b, csv3b, csv4b, csv5b, csv6b, csv7b, csv8b, csv9b, csv10b, 596 csv11b, csv12b, csv13b, csv14b, csv15b, csv16b, csv17b, csv18b, csv19b, csv20b, 597 csv21b, csv22b, csv23b, csv24b, csv25b, csv26b, csv27b, csv28b, csv29b, csv30b, 598 csv31b, csv32b]; 599 600 auto tsvSet1 = [tsv1, tsv2, tsv3, tsv4, tsv5, tsv6, tsv7, tsv8, tsv9, tsv10, 601 tsv11, tsv12, tsv13, tsv14, tsv15, tsv16, tsv17, tsv18, tsv19, tsv20, 602 tsv21, tsv22, tsv23, tsv24, tsv25, tsv26, tsv27, tsv28, tsv29, tsv30, 603 tsv31, 
tsv32]; 604 605 auto tsvSet1_x = [tsv1_x, tsv2_x, tsv3_x, tsv4_x, tsv5_x, tsv6_x, tsv7_x, tsv8_x, tsv9_x, tsv10_x, 606 tsv11_x, tsv12_x, tsv13_x, tsv14_x, tsv15_x, tsv16_x, tsv17_x, tsv18_x, tsv19_x, tsv20_x, 607 tsv21_x, tsv22_x, tsv23_x, tsv24_x, tsv25_x, tsv26_x, tsv27_x, tsv28_x, tsv29_x, tsv30_x, 608 tsv31_x, tsv32_x]; 609 610 auto tsvSet1_y = [tsv1_y, tsv2_y, tsv3_y, tsv4_y, tsv5_y, tsv6_y, tsv7_y, tsv8_y, tsv9_y, tsv10_y, 611 tsv11_y, tsv12_y, tsv13_y, tsv14_y, tsv15_y, tsv16_y, tsv17_y, tsv18_y, tsv19_y, tsv20_y, 612 tsv21_y, tsv22_y, tsv23_y, tsv24_y, tsv25_y, tsv26_y, tsv27_y, tsv28_y, tsv29_y, tsv30_y, 613 tsv31_y, tsv32_y]; 614 615 foreach (i, csva, csvb, tsv, tsv_x, tsv_y; lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y)) 616 { 617 import std.conv : to; 618 619 /* Byte streams for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */ 620 ubyte[] csvInputA = cast(ubyte[])csva; 621 ubyte[] csvInputB = cast(ubyte[])csvb; 622 623 /* CSV Set A vs TSV expected. */ 624 auto tsvResultA = appender!(char[])(); 625 csv2tsv(csvInputA, tsvResultA, "csvInputA_defaultTSV", i); 626 assert(tsv == tsvResultA.data, 627 format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 628 i + 1, csva, tsv, tsvResultA.data)); 629 630 /* CSV Set B vs TSV expected. Different CSV delimiters, same TSV results as CSV Set A.*/ 631 auto tsvResultB = appender!(char[])(); 632 csv2tsv(csvInputB, tsvResultB, "csvInputB_defaultTSV", i, '#', '^'); 633 assert(tsv == tsvResultB.data, 634 format("Unittest failure. tsv != tsvResultB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 635 i + 1, csvb, tsv, tsvResultB.data)); 636 637 /* CSV Set A and TSV with $ separator.*/ 638 csvInputA = cast(ubyte[])csva; 639 auto tsvResult_XA = appender!(char[])(); 640 csv2tsv(csvInputA, tsvResult_XA, "csvInputA_TSV_WithDollarDelimiter", i, '"', ',', '$'); 641 assert(tsv_x == tsvResult_XA.data, 642 format("Unittest failure. tsv_x != tsvResult_XA.data. 
Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 643 i + 1, csva, tsv_x, tsvResult_XA.data)); 644 645 /* CSV Set B and TSV with $ separator. Same TSV results as CSV Set A.*/ 646 csvInputB = cast(ubyte[])csvb; 647 auto tsvResult_XB = appender!(char[])(); 648 csv2tsv(csvInputB, tsvResult_XB, "csvInputB__TSV_WithDollarDelimiter", i, '#', '^', '$'); 649 assert(tsv_x == tsvResult_XB.data, 650 format("Unittest failure. tsv_x != tsvResult_XB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 651 i + 1, csvb, tsv_x, tsvResult_XB.data)); 652 653 /* CSV Set A and TSV with $ separator and tsv delimiter/newline replacement. */ 654 csvInputA = cast(ubyte[])csva; 655 auto tsvResult_YA = appender!(char[])(); 656 csv2tsv(csvInputA, tsvResult_YA, "csvInputA_TSV_WithDollarAndDelimReplacement", i, '"', ',', '$', "|--|"); 657 assert(tsv_y == tsvResult_YA.data, 658 format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 659 i + 1, csva, tsv_y, tsvResult_YA.data)); 660 661 /* CSV Set A and TSV with $ separator and tsv delimiter/newline replacement. Same TSV as CSV Set A.*/ 662 csvInputB = cast(ubyte[])csvb; 663 auto tsvResult_YB = appender!(char[])(); 664 csv2tsv(csvInputB, tsvResult_YB, "csvInputB__TSV_WithDollarAndDelimReplacement", i, '#', '^', '$', "|--|"); 665 assert(tsv_y == tsvResult_YB.data, 666 format("Unittest failure. tsv_y != tsvResult_YB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 667 i + 1, csvb, tsv_y, tsvResult_YB.data)); 668 669 } 670 } 671 672 unittest 673 { 674 /* Unit tests for 'maxRecords' feature of the csv2tsv function. 675 */ 676 677 /* Input CSV. 
*/ 678 auto csv1 = ""; 679 auto csv2 = ","; 680 auto csv3 = "a"; 681 auto csv4 = "a\n"; 682 auto csv5 = "a\nb"; 683 auto csv6 = "a\nb\n"; 684 auto csv7 = "a\nb\nc"; 685 auto csv8 = "a\nb\nc\n"; 686 auto csv9 = "a,aa"; 687 auto csv10 = "a,aa\n"; 688 auto csv11 = "a,aa\nb,bb"; 689 auto csv12 = "a,aa\nb,bb\n"; 690 auto csv13 = "a,aa\nb,bb\nc,cc"; 691 auto csv14 = "a,aa\nb,bb\nc,cc\n"; 692 693 auto csv15 = "\"a\",\"aa\""; 694 auto csv16 = "\"a\",\"aa\"\n"; 695 auto csv17 = "\"a\",\"aa\"\n\"b\",\"bb\""; 696 auto csv18 = "\"a\",\"aa\"\n\"b\",\"bb\"\n"; 697 auto csv19 = "\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\""; 698 auto csv20 = "\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"\n"; 699 700 /* TSV with max 1 record. */ 701 auto tsv1_max1 = ""; 702 auto tsv2_max1 = "\t\n"; 703 auto tsv3_max1 = "a\n"; 704 auto tsv4_max1 = "a\n"; 705 auto tsv5_max1 = "a\n"; 706 auto tsv6_max1 = "a\n"; 707 auto tsv7_max1 = "a\n"; 708 auto tsv8_max1 = "a\n"; 709 auto tsv9_max1 = "a\taa\n"; 710 auto tsv10_max1 = "a\taa\n"; 711 auto tsv11_max1 = "a\taa\n"; 712 auto tsv12_max1 = "a\taa\n"; 713 auto tsv13_max1 = "a\taa\n"; 714 auto tsv14_max1 = "a\taa\n"; 715 716 auto tsv15_max1 = "a\taa\n"; 717 auto tsv16_max1 = "a\taa\n"; 718 auto tsv17_max1 = "a\taa\n"; 719 auto tsv18_max1 = "a\taa\n"; 720 auto tsv19_max1 = "a\taa\n"; 721 auto tsv20_max1 = "a\taa\n"; 722 723 /* Remaining TSV converted after first call. 
*/ 724 auto tsv1_max1_rest = ""; 725 auto tsv2_max1_rest = ""; 726 auto tsv3_max1_rest = ""; 727 auto tsv4_max1_rest = ""; 728 auto tsv5_max1_rest = "b\n"; 729 auto tsv6_max1_rest = "b\n"; 730 auto tsv7_max1_rest = "b\nc\n"; 731 auto tsv8_max1_rest = "b\nc\n"; 732 auto tsv9_max1_rest = ""; 733 auto tsv10_max1_rest = ""; 734 auto tsv11_max1_rest = "b\tbb\n"; 735 auto tsv12_max1_rest = "b\tbb\n"; 736 auto tsv13_max1_rest = "b\tbb\nc\tcc\n"; 737 auto tsv14_max1_rest = "b\tbb\nc\tcc\n"; 738 739 auto tsv15_max1_rest = ""; 740 auto tsv16_max1_rest = ""; 741 auto tsv17_max1_rest = "b\tbb\n"; 742 auto tsv18_max1_rest = "b\tbb\n"; 743 auto tsv19_max1_rest = "b\tbb\nc\tcc\n"; 744 auto tsv20_max1_rest = "b\tbb\nc\tcc\n"; 745 746 /* TSV with max 2 records. */ 747 auto tsv1_max2 = ""; 748 auto tsv2_max2 = "\t\n"; 749 auto tsv3_max2 = "a\n"; 750 auto tsv4_max2 = "a\n"; 751 auto tsv5_max2 = "a\nb\n"; 752 auto tsv6_max2 = "a\nb\n"; 753 auto tsv7_max2 = "a\nb\n"; 754 auto tsv8_max2 = "a\nb\n"; 755 auto tsv9_max2 = "a\taa\n"; 756 auto tsv10_max2 = "a\taa\n"; 757 auto tsv11_max2 = "a\taa\nb\tbb\n"; 758 auto tsv12_max2 = "a\taa\nb\tbb\n"; 759 auto tsv13_max2 = "a\taa\nb\tbb\n"; 760 auto tsv14_max2 = "a\taa\nb\tbb\n"; 761 762 auto tsv15_max2 = "a\taa\n"; 763 auto tsv16_max2 = "a\taa\n"; 764 auto tsv17_max2 = "a\taa\nb\tbb\n"; 765 auto tsv18_max2 = "a\taa\nb\tbb\n"; 766 auto tsv19_max2 = "a\taa\nb\tbb\n"; 767 auto tsv20_max2 = "a\taa\nb\tbb\n"; 768 769 /* Remaining TSV converted after first call. 
*/ 770 auto tsv1_max2_rest = ""; 771 auto tsv2_max2_rest = ""; 772 auto tsv3_max2_rest = ""; 773 auto tsv4_max2_rest = ""; 774 auto tsv5_max2_rest = ""; 775 auto tsv6_max2_rest = ""; 776 auto tsv7_max2_rest = "c\n"; 777 auto tsv8_max2_rest = "c\n"; 778 auto tsv9_max2_rest = ""; 779 auto tsv10_max2_rest = ""; 780 auto tsv11_max2_rest = ""; 781 auto tsv12_max2_rest = ""; 782 auto tsv13_max2_rest = "c\tcc\n"; 783 auto tsv14_max2_rest = "c\tcc\n"; 784 785 auto tsv15_max2_rest = ""; 786 auto tsv16_max2_rest = ""; 787 auto tsv17_max2_rest = ""; 788 auto tsv18_max2_rest = ""; 789 auto tsv19_max2_rest = "c\tcc\n"; 790 auto tsv20_max2_rest = "c\tcc\n"; 791 792 auto csvSet1 = 793 [csv1, csv2, csv3, csv4, csv5, csv6, csv7, 794 csv8, csv9, csv10, csv11, csv12, csv13, csv14, 795 csv15, csv16, csv17, csv18, csv19, csv20 ]; 796 797 auto tsvMax1Set1 = 798 [tsv1_max1, tsv2_max1, tsv3_max1, tsv4_max1, tsv5_max1, tsv6_max1, tsv7_max1, 799 tsv8_max1, tsv9_max1, tsv10_max1, tsv11_max1, tsv12_max1, tsv13_max1, tsv14_max1, 800 tsv15_max1, tsv16_max1, tsv17_max1, tsv18_max1, tsv19_max1, tsv20_max1]; 801 802 auto tsvMax1RestSet1 = 803 [tsv1_max1_rest, tsv2_max1_rest, tsv3_max1_rest, tsv4_max1_rest, tsv5_max1_rest, tsv6_max1_rest, tsv7_max1_rest, 804 tsv8_max1_rest, tsv9_max1_rest, tsv10_max1_rest, tsv11_max1_rest, tsv12_max1_rest, tsv13_max1_rest, tsv14_max1_rest, 805 tsv15_max1_rest, tsv16_max1_rest, tsv17_max1_rest, tsv18_max1_rest, tsv19_max1_rest, tsv20_max1_rest]; 806 807 auto tsvMax2Set1 = 808 [tsv1_max2, tsv2_max2, tsv3_max2, tsv4_max2, tsv5_max2, tsv6_max2, tsv7_max2, 809 tsv8_max2, tsv9_max2, tsv10_max2, tsv11_max2, tsv12_max2, tsv13_max2, tsv14_max2, 810 tsv15_max2, tsv16_max2, tsv17_max2, tsv18_max2, tsv19_max2, tsv20_max2]; 811 812 auto tsvMax2RestSet1 = 813 [tsv1_max2_rest, tsv2_max2_rest, tsv3_max2_rest, tsv4_max2_rest, tsv5_max2_rest, tsv6_max2_rest, tsv7_max2_rest, 814 tsv8_max2_rest, tsv9_max2_rest, tsv10_max2_rest, tsv11_max2_rest, tsv12_max2_rest, tsv13_max2_rest, 
tsv14_max2_rest, 815 tsv15_max2_rest, tsv16_max2_rest, tsv17_max2_rest, tsv18_max2_rest, tsv19_max2_rest, tsv20_max2_rest]; 816 817 foreach (i, csv, tsv_max1, tsv_max1_rest, tsv_max2, tsv_max2_rest; 818 lockstep(csvSet1, tsvMax1Set1, tsvMax1RestSet1, tsvMax2Set1, tsvMax2RestSet1)) 819 { 820 /* Byte stream for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */ 821 ubyte[] csvInput = cast(ubyte[])csv; 822 823 /* Call with maxRecords == 1. */ 824 auto tsvMax1Result = appender!(char[])(); 825 csv2tsv(csvInput, tsvMax1Result, "maxRecords-one", i, '"', ',', '\t', " ", NullableSizeT(1)); 826 assert(tsv_max1 == tsvMax1Result.data, 827 format("Unittest failure. tsv_max1 != tsvMax1Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 828 i + 1, csv, tsv_max1, tsvMax1Result.data)); 829 830 /* Follow-up call getting all records remaining after the maxRecords==1 call. */ 831 auto tsvMax1RestResult = appender!(char[])(); 832 csv2tsv(csvInput, tsvMax1RestResult, "maxRecords-one-followup", i); 833 assert(tsv_max1_rest == tsvMax1RestResult.data, 834 format("Unittest failure. tsv_max1_rest != tsvMax1RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 835 i + 1, csv, tsv_max1_rest, tsvMax1RestResult.data)); 836 837 /* Reset the input stream for maxRecords == 2. */ 838 csvInput = cast(ubyte[])csv; 839 840 /* Call with maxRecords == 2. */ 841 auto tsvMax2Result = appender!(char[])(); 842 csv2tsv(csvInput, tsvMax2Result, "maxRecords-two", i, '"', ',', '\t', " ", NullableSizeT(2)); 843 assert(tsv_max2 == tsvMax2Result.data, 844 format("Unittest failure. tsv_max2 != tsvMax2Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 845 i + 1, csv, tsv_max2, tsvMax2Result.data)); 846 847 /* Follow-up call getting all records remaining after the maxRecords==2 call. 
*/ 848 auto tsvMax2RestResult = appender!(char[])(); 849 csv2tsv(csvInput, tsvMax2RestResult, "maxRecords-two-followup", i); 850 assert(tsv_max2_rest == tsvMax2RestResult.data, 851 format("Unittest failure. tsv_max2_rest != tsvMax2RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 852 i + 1, csv, tsv_max2_rest, tsvMax2RestResult.data)); 853 } 854 }