1 /**
2 Convert CSV formatted data to TSV format.
3 
4 This program converts comma-separated value data to tab-separated format.
5 
6 Copyright (c) 2016-2018, eBay Software Foundation
7 Initially written by Jon Degenhardt
8 
9 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 
12 module csv2tsv;
13 
14 import std.stdio;
15 import std.format : format;
16 import std.range;
17 import std.traits : Unqual;
18 import std.typecons : Nullable, tuple;
19 
/** Short help text. Passed to defaultGetoptPrinter, together with the option list, when
 * the standard help option is given on the command line.
 */
auto helpText = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details the CSV formats accepted.

Options:
EOS";
29 
/** Detailed help text. Passed to defaultGetoptPrinter, together with the option list, when
 * '--help-verbose' is given on the command line.
 */
auto helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.

Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.

Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.

There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.

Behaviors of this program that often vary between CSV implementations:
  * Newlines are supported in quoted fields.
  * Double quotes are permitted in a non-quoted field. However, a field starting
    with a quote must follow quoting rules.
  * Each record can have a different numbers of fields.
  * The three common forms of newlines are supported: CR, CRLF, LF.
  * A newline will be added if the file does not end with one.
  * No whitespace trimming is done.

This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.

UTF-8 input is assumed. Convert other encodings prior to invoking this tool.

Options:
EOS";
68 
/** Container for command line options.
 *
 * The fields hold the option values, initialized to their defaults. processArgs fills
 * them in from the command line and validates the combination.
 */
struct Csv2tsvOptions
{
    string programName;
    bool helpVerbose = false;          // --help-verbose
    bool hasHeader = false;            // --H|header
    char csvQuoteChar = '"';           // --q|quote
    char csvDelimChar = ',';           // --c|csv-delim
    char tsvDelimChar = '\t';          // --t|tsv-delim
    string tsvDelimReplacement = " ";  // --r|replacement
    bool versionWanted = false;        // --V|version

    /** Parses and validates the command line.
     *
     * Recognized options are removed from cmdArgs by getopt. Help and version requests
     * are answered here by printing to standard output. Errors raised by getopt or by
     * the consistency checks are reported on standard error.
     *
     * Params:
     *   cmdArgs = The program arguments, program name first.
     *
     * Returns: A tuple (bool, int). The bool is true when the caller should go on to
     *          process input. When it is false the int is the process exit code: 0 after
     *          printing help or version information, 1 after an error.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            auto r = getopt(
                cmdArgs,
                "help-verbose",  "     Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",      "     Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote",       "CHR  Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim",   "CHR  Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim",   "CHR  Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|replacement", "STR  Replacement for newline and TSV field delimiters found in CSV input. Default: Space.", &tsvDelimReplacement,
                std.getopt.config.caseSensitive,
                "V|version",     "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Consistency checks. The exceptions thrown here are caught below and turned
             * into an error message plus a non-zero exit code.
             */
            if (csvQuoteChar == '\n' || csvQuoteChar == '\r')
            {
                throw new Exception ("CSV quote character cannot be newline (--q|quote).");
            }

            if (csvQuoteChar == csvDelimChar)
            {
                throw new Exception("CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");
            }

            if (csvQuoteChar == tsvDelimChar)
            {
                throw new Exception("CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");
            }

            if (csvDelimChar == '\n' || csvDelimChar == '\r')
            {
                throw new Exception ("CSV field delimiter cannot be newline (--c|csv-delim).");
            }

            if (tsvDelimChar == '\n' || tsvDelimChar == '\r')
            {
                /* The option is registered as 't|tsv-delim'; the message names it the same way. */
                throw new Exception ("TSV field delimiter cannot be newline (--t|tsv-delim).");
            }

            /* The replacement text is written into TSV fields, so it must not itself
             * contain a record or field terminator.
             */
            if (canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(tsvDelimReplacement))
            {
                throw new Exception ("Replacement character cannot contain newlines or TSV field delimiters (--r|replacement).");
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
163 
version(unittest)
{
    // No main in unittest
}
else
{
    /** Program entry point.
     *
     * Processes the command line, then converts the files named by the remaining
     * arguments (see csv2tsvFiles).
     *
     * Returns: The exit code from processArgs when it reports that processing should
     *          stop; 1 when the conversion throws an Exception; 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try csv2tsvFiles(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            /* Terminate any partially written output line and flush standard output
             * before reporting, so the error message on stderr comes after the
             * converted output. The stream to flush is stdout, not stdin.
             */
            writeln();
            stdout.flush();
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}
199 
/* A nullable size_t that uses size_t.max as its null value. This relies on the D library
 * feature where a type can reserve a single value to represent null.
 */
alias NullableSizeT = Nullable!(size_t, size_t.max);
202 
203 
/** Converts each input file from CSV to TSV, writing everything to standard output.
 *
 * Files are processed in the order given. The name "-" stands for standard input, which
 * is also what is read when no files are given. When cmdopt.hasHeader is set, the first
 * record of every file after the first is converted into a null sink, so only the first
 * file's header reaches the output.
 *
 * Params:
 *   cmdopt     = Command line options (delimiters, quote character, header handling).
 *   inputFiles = Names of the files to convert.
 */
void csv2tsvFiles(in Csv2tsvOptions cmdopt, in string[] inputFiles)
{
    import std.algorithm : joiner;
    import tsvutil : BufferedOutputRange;

    /* A single buffer backs both kinds of input: files are read in chunks of the whole
     * 1024 * 1024 bytes, standard input in chunks of the first 1024 bytes.
     */
    ubyte[1024 * 1024] fileRawBuf;
    ubyte[] stdinRawBuf = fileRawBuf[0..1024];
    auto stdoutWriter = BufferedOutputRange!(typeof(stdout))(stdout);

    auto sources = (inputFiles.length > 0) ? inputFiles : ["-"];

    foreach (fileIndex, filename; sources)
    {
        auto chunks = (filename == "-") ?
            stdin.byChunk(stdinRawBuf) : filename.File.byChunk(fileRawBuf);
        auto byteStream = chunks.joiner;

        /* Line number offset handed to csv2tsv for error reporting. */
        size_t linesAlreadyRead = 0;

        if (fileIndex > 0 && cmdopt.hasHeader)
        {
            /* Not the first file: consume the header record without writing it. */
            auto nullWriter = NullSink();
            csv2tsv(byteStream, nullWriter, filename, 0,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                    NullableSizeT(1));
            linesAlreadyRead = 1;
        }

        csv2tsv(byteStream, stdoutWriter, filename, linesAlreadyRead,
                cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement);
    }
}
246 
/** Reads CSV from an input source, converts it to TSV and writes it to an output range.
 *
 * The input is consumed one byte at a time and run through a four-state machine (see
 * State below). CR and CRLF are collapsed to LF before a byte is dispatched. TSV
 * delimiters found in any field, and newlines found in quoted fields, are replaced by
 * tsvDelimReplacement. A newline is written at the end if the last record has none.
 *
 * Params:
 *   inputStream         =  A ubyte input range to read CSV text from. A ubyte range
 *                          matches byChunk. It also avoids conversion to dchar by front().
 *   outputStream        =  An output range to write TSV text to.
 *   filename            =  Name of file to use when reporting errors. A descriptive name
 *                          can be used in lieu of a file name.
 *   currFileLineNumber  =  First line being processed. Used when reporting errors. Needed
 *                          only when part of the input has already been processed.
 *   csvQuote            =  The quoting character used in the input CSV file.
 *   csvDelim            =  The field delimiter character used in the input CSV file.
 *   tsvDelim            =  The field delimiter character to use in the generated TSV file.
 *   tsvDelimReplacement =  A string to use when replacing newlines and TSV field delimiters
 *                          occurring in CSV fields.
 *   maxRecords          =  The maximum number of records to process (output lines).
 *                          Processing stops right after the newline ending that record.
 *                          This is intended to support processing the header line
 *                          separately.
 *
 * Throws: Exception on finding inconsistent CSV. Exception text includes the filename and
 *         line number where the error was identified.
 */
void csv2tsv(InputRange, OutputRange)
    (auto ref InputRange inputStream, auto ref OutputRange outputStream,
     string filename = "(none)", size_t currFileLineNumber = 0,
     const char csvQuote = '"', const char csvDelim = ',', const char tsvDelim = '\t',
     string tsvDelimReplacement = " ",
     NullableSizeT maxRecords=NullableSizeT.init,
     )
if (isInputRange!InputRange && isOutputRange!(OutputRange, char) &&
    is(Unqual!(ElementType!InputRange) == ubyte))
{
    /* FieldEnd:           at the start of input or just past a field/record terminator.
     * NonQuotedField:     inside a field that did not start with the quote character.
     * QuotedField:        inside a field that started with the quote character.
     * QuoteInQuotedField: just saw a quote character while inside a quoted field.
     */
    enum State { FieldEnd, NonQuotedField, QuotedField, QuoteInQuotedField }

    State state = State.FieldEnd;
    size_t recordNum = 1;      // Number of the record being read, i.e. the output line number.
    size_t fieldNum = 0;       // Number of the field being read on the current line.

    /* Builds the exception used for both kinds of improperly terminated quoted field. */
    Exception unterminatedQuoteError()
    {
        return new Exception(
            format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                   (filename == "-") ? "Standard Input" : filename,
                   currFileLineNumber + recordNum));
    }

    /* True once more records have been started than maxRecords allows. */
    bool recordLimitReached()
    {
        return !maxRecords.isNull && recordNum > maxRecords;
    }

InputLoop: while (!inputStream.empty)
    {
        char ch = inputStream.front;
        inputStream.popFront;

        /* Treat CR and CRLF as a single LF. */
        if (ch == '\r')
        {
            if (!inputStream.empty && inputStream.front == '\n') inputStream.popFront;
            ch = '\n';
        }

    OuterSwitch: final switch (state)
        {
        case State.FieldEnd:
            /* A new field begins with this character. */
            ++fieldNum;

            /* An if rather than a switch, because of the 'goto case' into the outer switch. */
            if (ch == csvQuote)
            {
                state = State.QuotedField;
                break OuterSwitch;
            }

            /* State change only. The character is handled by the non-quoted case. */
            state = State.NonQuotedField;
            goto case State.NonQuotedField;

        case State.NonQuotedField:
            switch (ch)
            {
            default:
                put(outputStream, ch);
                break OuterSwitch;
            case csvDelim:
                put(outputStream, tsvDelim);
                state = State.FieldEnd;
                break OuterSwitch;
            case tsvDelim:
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            case '\n':
                /* End of record. */
                put(outputStream, '\n');
                ++recordNum;
                fieldNum = 0;
                state = State.FieldEnd;
                if (recordLimitReached()) break InputLoop;
                break OuterSwitch;
            }

        case State.QuotedField:
            switch (ch)
            {
            default:
                put(outputStream, ch);
                break OuterSwitch;
            case csvQuote:
                /* Either the closing quote or the first half of an escaped quote; the
                 * next character decides. If this quote is the very last character of
                 * the input (quoted field on a last line without trailing newline), go
                 * straight to FieldEnd so the field counts as properly terminated and
                 * the final newline gets added.
                 */
                state = inputStream.empty ? State.FieldEnd : State.QuoteInQuotedField;
                break OuterSwitch;
            case '\n':
                /* Newline inside a quoted field. */
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            case tsvDelim:
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            }

        case State.QuoteInQuotedField:
            /* The previous character was a quote inside a quoted field. */
            switch (ch)
            {
            case csvQuote:
                /* A pair of quotes stands for one literal quote. */
                put(outputStream, csvQuote);
                state = State.QuotedField;
                break OuterSwitch;
            case csvDelim:
                put(outputStream, tsvDelim);
                state = State.FieldEnd;
                break OuterSwitch;
            case '\n':
                /* End of record. */
                put(outputStream, '\n');
                ++recordNum;
                fieldNum = 0;
                state = State.FieldEnd;
                if (recordLimitReached()) break InputLoop;
                break OuterSwitch;
            default:
                /* Anything else after a closing quote is inconsistent CSV. */
                throw unterminatedQuoteError();
            }
        }
    }

    /* Input ended while still inside a quoted field. */
    if (state == State.QuotedField) throw unterminatedQuoteError();

    /* The last line had content but no terminating newline. */
    if (fieldNum > 0) put(outputStream, '\n');
}
409 
unittest
{
    /* Unit tests for the csv2tsv function.
     *
     * The tests exercise different CSV combinations and escaping cases. The fixtures are
     * five parallel arrays: entry N of each array belongs to test N. The two CSV arrays
     * hold the same data content with different CSV delimiters.
     *
     * The tests rely on the property that changing the CSV delimiters doesn't change the
     * resulting TSV, so one expected TSV array serves both CSV arrays. Changing the TSV
     * delimiter does change the result, as TSV doesn't support having its delimiters in
     * the data, so each TSV configuration has its own expected array.
     *
     * This test set does not test main, file handling, or error messages. These are
     * handled by tests run against the executable.
     */

    /* CSV set A: default CSV syntax (double-quote for quoting, comma as delimiter). */
    string[] csvSetA = [
        /*  1 */ "a,b,c",
        /*  2 */ "a,bc,,,def",
        /*  3 */ ",a, b , cd ,",
        /*  4 */ "ß,ßÀß,あめりか物語,书名: 五色石",
        /*  5 */ "\"\n\",\"\n\n\",\"\n\n\n\"",
        /*  6 */ "\"\t\",\"\t\t\",\"\t\t\t\"",
        /*  7 */ "\",\",\",,\",\",,,\"",
        /*  8 */ "\"\",\"\"\"\",\"\"\"\"\"\"",
        /*  9 */ "\"ab, de\tfg\"\"\nhij\"",
        /* 10 */ "",
        /* 11 */ ",",
        /* 12 */ ",,",
        /* 13 */ "\"\r\",\"\r\r\",\"\r\r\r\"",
        /* 14 */ "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\"",
        /* 15 */ "\"ab, de\tfg\"\"\rhij\"",
        /* 16 */ "\"ab, de\tfg\"\"\r\nhij\"",
        /* 17 */ "ab\",ab\"cd",
        /* 18 */ "\n\n\n",
        /* 19 */ "\t",
        /* 20 */ "\t\t",
        /* 21 */ "a\n",
        /* 22 */ "a,\n",
        /* 23 */ "a,b\n",
        /* 24 */ ",\n",
        /* 25 */ "#",
        /* 26 */ "^",
        /* 27 */ "#^#",
        /* 28 */ "^#^",
        /* 29 */ "$",
        /* 30 */ "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n",
        /* 31 */ "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n",
        /* 32 */ ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\"",
    ];

    /* CSV set B: same data and TSV results as set A, but uses # for quote and ^ for comma. */
    string[] csvSetB = [
        /*  1 */ "a^b^c",
        /*  2 */ "a^bc^^^def",
        /*  3 */ "^a^ b ^ cd ^",
        /*  4 */ "ß^ßÀß^あめりか物語^书名: 五色石",
        /*  5 */ "#\n#^#\n\n#^#\n\n\n#",
        /*  6 */ "#\t#^#\t\t#^#\t\t\t#",
        /*  7 */ "#,#^#,,#^#,,,#",
        /*  8 */ "##^#\"#^#\"\"#",
        /*  9 */ "#ab, de\tfg\"\nhij#",
        /* 10 */ "",
        /* 11 */ "^",
        /* 12 */ "^^",
        /* 13 */ "#\r#^#\r\r#^#\r\r\r#",
        /* 14 */ "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#",
        /* 15 */ "#ab, de\tfg\"\rhij#",
        /* 16 */ "#ab, de\tfg\"\r\nhij#",
        /* 17 */ "ab\"^ab\"cd",
        /* 18 */ "\n\n\n",
        /* 19 */ "\t",
        /* 20 */ "\t\t",
        /* 21 */ "a\n",
        /* 22 */ "a^\n",
        /* 23 */ "a^b\n",
        /* 24 */ "^\n",
        /* 25 */ "####",
        /* 26 */ "#^#",
        /* 27 */ "###^###",
        /* 28 */ "#^##^#",
        /* 29 */ "$",
        /* 30 */ "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n",
        /* 31 */ "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n",
        /* 32 */ "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#",
    ];

    /* Expected TSV for CSV sets A and B with the default TSV delimiter and replacement. */
    string[] tsvDefault = [
        /*  1 */ "a\tb\tc\n",
        /*  2 */ "a\tbc\t\t\tdef\n",
        /*  3 */ "\ta\t b \t cd \t\n",
        /*  4 */ "ß\tßÀß\tあめりか物語\t书名: 五色石\n",
        /*  5 */ " \t  \t   \n",
        /*  6 */ " \t  \t   \n",
        /*  7 */ ",\t,,\t,,,\n",
        /*  8 */ "\t\"\t\"\"\n",
        /*  9 */ "ab, de fg\" hij\n",
        /* 10 */ "",
        /* 11 */ "\t\n",
        /* 12 */ "\t\t\n",
        /* 13 */ " \t  \t   \n",
        /* 14 */ " \t  \t   \n",
        /* 15 */ "ab, de fg\" hij\n",
        /* 16 */ "ab, de fg\" hij\n",
        /* 17 */ "ab\"\tab\"cd\n",
        /* 18 */ "\n\n\n",
        /* 19 */ " \n",
        /* 20 */ "  \n",
        /* 21 */ "a\n",
        /* 22 */ "a\t\n",
        /* 23 */ "a\tb\n",
        /* 24 */ "\t\n",
        /* 25 */ "#\n",
        /* 26 */ "^\n",
        /* 27 */ "#^#\n",
        /* 28 */ "^#^\n",
        /* 29 */ "$\n",
        /* 30 */ "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n",
        /* 31 */ "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n",
        /* 32 */ "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n",
    ];

    /* Expected TSV for CSV sets A and B with $ as the TSV delimiter rather than tab. This
     * also changes what gets replaced when TAB and $ appear in the CSV data.
     */
    string[] tsvDollar = [
        /*  1 */ "a$b$c\n",
        /*  2 */ "a$bc$$$def\n",
        /*  3 */ "$a$ b $ cd $\n",
        /*  4 */ "ß$ßÀß$あめりか物語$书名: 五色石\n",
        /*  5 */ " $  $   \n",
        /*  6 */ "\t$\t\t$\t\t\t\n",
        /*  7 */ ",$,,$,,,\n",
        /*  8 */ "$\"$\"\"\n",
        /*  9 */ "ab, de\tfg\" hij\n",
        /* 10 */ "",
        /* 11 */ "$\n",
        /* 12 */ "$$\n",
        /* 13 */ " $  $   \n",
        /* 14 */ " $  $   \n",
        /* 15 */ "ab, de\tfg\" hij\n",
        /* 16 */ "ab, de\tfg\" hij\n",
        /* 17 */ "ab\"$ab\"cd\n",
        /* 18 */ "\n\n\n",
        /* 19 */ "\t\n",
        /* 20 */ "\t\t\n",
        /* 21 */ "a\n",
        /* 22 */ "a$\n",
        /* 23 */ "a$b\n",
        /* 24 */ "$\n",
        /* 25 */ "#\n",
        /* 26 */ "^\n",
        /* 27 */ "#^#\n",
        /* 28 */ "^#^\n",
        /* 29 */ " \n",
        /* 30 */ " $ \n $  $  \n^# $ #^$# ^$^ #\n",
        /* 31 */ "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n",
        /* 32 */ "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n",
    ];

    /* Expected TSV for CSV sets A and B with $ as the TSV delimiter and |--| as the
     * replacement string: newlines and '$' in the original data become |--|.
     */
    string[] tsvDollarReplaced = [
        /*  1 */ "a$b$c\n",
        /*  2 */ "a$bc$$$def\n",
        /*  3 */ "$a$ b $ cd $\n",
        /*  4 */ "ß$ßÀß$あめりか物語$书名: 五色石\n",
        /*  5 */ "|--|$|--||--|$|--||--||--|\n",
        /*  6 */ "\t$\t\t$\t\t\t\n",
        /*  7 */ ",$,,$,,,\n",
        /*  8 */ "$\"$\"\"\n",
        /*  9 */ "ab, de\tfg\"|--|hij\n",
        /* 10 */ "",
        /* 11 */ "$\n",
        /* 12 */ "$$\n",
        /* 13 */ "|--|$|--||--|$|--||--||--|\n",
        /* 14 */ "|--|$|--||--|$|--||--||--|\n",
        /* 15 */ "ab, de\tfg\"|--|hij\n",
        /* 16 */ "ab, de\tfg\"|--|hij\n",
        /* 17 */ "ab\"$ab\"cd\n",
        /* 18 */ "\n\n\n",
        /* 19 */ "\t\n",
        /* 20 */ "\t\t\n",
        /* 21 */ "a\n",
        /* 22 */ "a$\n",
        /* 23 */ "a$b\n",
        /* 24 */ "$\n",
        /* 25 */ "#\n",
        /* 26 */ "^\n",
        /* 27 */ "#^#\n",
        /* 28 */ "^#^\n",
        /* 29 */ "|--|\n",
        /* 30 */ "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n",
        /* 31 */ "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n",
        /* 32 */ "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n",
    ];

    /* Converts one CSV string with the given settings and asserts that the output equals
     * the expected TSV. failureFormat is the assert message template; it receives the
     * 1-based test number, the CSV input, the expected TSV and the actual result.
     */
    void check(size_t index, string csvText, string expectedTsv, string sourceName,
               string failureFormat,
               char quoteChar = '"', char delimChar = ',', char tsvDelimChar = '\t',
               string replacement = " ")
    {
        /* csv2tsv consumes its input range, so every run starts from a fresh slice. */
        ubyte[] input = cast(ubyte[])csvText;
        auto result = appender!(char[])();
        csv2tsv(input, result, sourceName, index, quoteChar, delimChar, tsvDelimChar, replacement);
        assert(expectedTsv == result.data,
               format(failureFormat, index + 1, csvText, expectedTsv, result.data));
    }

    foreach (i; 0 .. csvSetA.length)
    {
        /* CSV set A vs expected TSV, default TSV settings. */
        check(i, csvSetA[i], tsvDefault[i], "csvInputA_defaultTSV",
              "Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n");

        /* CSV set B vs expected TSV. Different CSV delimiters, same TSV results as set A. */
        check(i, csvSetB[i], tsvDefault[i], "csvInputB_defaultTSV",
              "Unittest failure. tsv != tsvResultB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
              '#', '^');

        /* CSV set A, TSV with $ separator. */
        check(i, csvSetA[i], tsvDollar[i], "csvInputA_TSV_WithDollarDelimiter",
              "Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
              '"', ',', '$');

        /* CSV set B, TSV with $ separator. Same TSV results as set A. */
        check(i, csvSetB[i], tsvDollar[i], "csvInputB__TSV_WithDollarDelimiter",
              "Unittest failure. tsv_x != tsvResult_XB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
              '#', '^', '$');

        /* CSV set A, TSV with $ separator and |--| as delimiter/newline replacement. */
        check(i, csvSetA[i], tsvDollarReplaced[i], "csvInputA_TSV_WithDollarAndDelimReplacement",
              "Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
              '"', ',', '$', "|--|");

        /* CSV set B, TSV with $ separator and |--| replacement. Same TSV results as set A. */
        check(i, csvSetB[i], tsvDollarReplaced[i], "csvInputB__TSV_WithDollarAndDelimReplacement",
              "Unittest failure. tsv_y != tsvResult_YB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
              '#', '^', '$', "|--|");
    }
}
683 
unittest
{
    /* Unit tests for the 'maxRecords' feature of the csv2tsv function.
     *
     * Every test case is one row in the table below: a CSV input, the TSV expected
     * from a call limited to one record, the TSV expected from a follow-up call on
     * the same (already partially consumed) input, and the same two expectations
     * for a limit of two records.
     */
    static struct MaxRecordsCase
    {
        string csv;         /* Input CSV. */
        string max1;        /* TSV with max 1 record. */
        string max1Rest;    /* Remaining TSV converted after the max 1 call. */
        string max2;        /* TSV with max 2 records. */
        string max2Rest;    /* Remaining TSV converted after the max 2 call. */
    }

    MaxRecordsCase[] cases = [
        /* Single field per record, unquoted. */
        MaxRecordsCase("",           "",      "",        "",        ""),
        MaxRecordsCase(",",          "\t\n",  "",        "\t\n",    ""),
        MaxRecordsCase("a",          "a\n",   "",        "a\n",     ""),
        MaxRecordsCase("a\n",        "a\n",   "",        "a\n",     ""),
        MaxRecordsCase("a\nb",       "a\n",   "b\n",     "a\nb\n",  ""),
        MaxRecordsCase("a\nb\n",     "a\n",   "b\n",     "a\nb\n",  ""),
        MaxRecordsCase("a\nb\nc",    "a\n",   "b\nc\n",  "a\nb\n",  "c\n"),
        MaxRecordsCase("a\nb\nc\n",  "a\n",   "b\nc\n",  "a\nb\n",  "c\n"),

        /* Two fields per record, unquoted. */
        MaxRecordsCase("a,aa",                "a\taa\n",  "",                "a\taa\n",         ""),
        MaxRecordsCase("a,aa\n",              "a\taa\n",  "",                "a\taa\n",         ""),
        MaxRecordsCase("a,aa\nb,bb",          "a\taa\n",  "b\tbb\n",         "a\taa\nb\tbb\n",  ""),
        MaxRecordsCase("a,aa\nb,bb\n",        "a\taa\n",  "b\tbb\n",         "a\taa\nb\tbb\n",  ""),
        MaxRecordsCase("a,aa\nb,bb\nc,cc",    "a\taa\n",  "b\tbb\nc\tcc\n",  "a\taa\nb\tbb\n",  "c\tcc\n"),
        MaxRecordsCase("a,aa\nb,bb\nc,cc\n",  "a\taa\n",  "b\tbb\nc\tcc\n",  "a\taa\nb\tbb\n",  "c\tcc\n"),

        /* Two fields per record, every field quoted. */
        MaxRecordsCase("\"a\",\"aa\"",
                       "a\taa\n",  "",                "a\taa\n",         ""),
        MaxRecordsCase("\"a\",\"aa\"\n",
                       "a\taa\n",  "",                "a\taa\n",         ""),
        MaxRecordsCase("\"a\",\"aa\"\n\"b\",\"bb\"",
                       "a\taa\n",  "b\tbb\n",         "a\taa\nb\tbb\n",  ""),
        MaxRecordsCase("\"a\",\"aa\"\n\"b\",\"bb\"\n",
                       "a\taa\n",  "b\tbb\n",         "a\taa\nb\tbb\n",  ""),
        MaxRecordsCase("\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"",
                       "a\taa\n",  "b\tbb\nc\tcc\n",  "a\taa\nb\tbb\n",  "c\tcc\n"),
        MaxRecordsCase("\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"\n",
                       "a\taa\n",  "b\tbb\nc\tcc\n",  "a\taa\nb\tbb\n",  "c\tcc\n"),
    ];

    foreach (i, tc; cases)
    {
        /* Local aliases so the failure messages read the same as the expectations. */
        string csv = tc.csv;
        string tsv_max1 = tc.max1;
        string tsv_max1_rest = tc.max1Rest;
        string tsv_max2 = tc.max2;
        string tsv_max2_rest = tc.max2Rest;

        /* Byte stream handed to csv2tsv. It is passed as an lvalue and consumed by the
         * call, which is what lets the follow-up call pick up where the first stopped.
         * It must be re-created before being used for a fresh conversion.
         */
        ubyte[] csvInput = cast(ubyte[])csv;

        /* First pass: maxRecords == 1. */
        auto tsvMax1Result = appender!(char[])();
        csv2tsv(csvInput, tsvMax1Result, "maxRecords-one", i, '"', ',', '\t', " ", NullableSizeT(1));
        assert(tsv_max1 == tsvMax1Result.data,
               format("Unittest failure. tsv_max1 != tsvMax1Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max1, tsvMax1Result.data));

        /* Second pass on the same stream: everything left after the maxRecords == 1 call. */
        auto tsvMax1RestResult = appender!(char[])();
        csv2tsv(csvInput, tsvMax1RestResult, "maxRecords-one-followup", i);
        assert(tsv_max1_rest == tsvMax1RestResult.data,
               format("Unittest failure. tsv_max1_rest != tsvMax1RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max1_rest, tsvMax1RestResult.data));

        /* Start over from the full input for the maxRecords == 2 checks. */
        csvInput = cast(ubyte[])csv;

        /* First pass: maxRecords == 2. */
        auto tsvMax2Result = appender!(char[])();
        csv2tsv(csvInput, tsvMax2Result, "maxRecords-two", i, '"', ',', '\t', " ", NullableSizeT(2));
        assert(tsv_max2 == tsvMax2Result.data,
               format("Unittest failure. tsv_max2 != tsvMax2Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max2, tsvMax2Result.data));

        /* Second pass on the same stream: everything left after the maxRecords == 2 call. */
        auto tsvMax2RestResult = appender!(char[])();
        csv2tsv(csvInput, tsvMax2RestResult, "maxRecords-two-followup", i);
        assert(tsv_max2_rest == tsvMax2RestResult.data,
               format("Unittest failure. tsv_max2_rest != tsvMax2RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csv, tsv_max2_rest, tsvMax2RestResult.data));
    }
}