1 /**
2 Command line tool that appends multiple TSV files. It is header aware and supports
3 tracking the original source file of each row.
4 
5 Copyright (c) 2017-2021, eBay Inc.
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_utils.tsv_append;
11 
12 import std.conv : to;
13 import std.exception : enforce;
14 import std.range;
15 import std.stdio;
16 import std.typecons : tuple;
17 
/* D runtime options embedded in the executable. "gcopt=cleanup:none" sets the
 * garbage collector's 'cleanup' option to 'none', which controls how live objects
 * are treated when the program terminates. The declaration is only compiled when
 * the compiler front-end version is 2.085 or later.
 * NOTE(review): presumably this avoids a final GC pass at exit to speed up
 * shutdown, and the version guard reflects when the option became available -
 * confirm against the D runtime documentation.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
19 
version(unittest)
{
    // Unit test builds get their main from the -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Parses the command line into a TsvAppendOptions, then hands off to tsvAppend
     * with a buffered wrapper around stdout. Returns the process exit code: the code
     * chosen by argument processing when it asks to stop (help, version, or an
     * argument error), 1 if tsvAppend throws an Exception, and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        import tsv_utils.common.utils : BufferedOutputRange, LineBuffered;

        /* Under DMD code coverage builds, merge coverage reports across runs. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvAppendOptions options;
        auto parsed = options.processArgs(cmdArgs);

        /* parsed[0] is the "continue" flag, parsed[1] the exit code to use when stopping. */
        immutable bool shouldContinue = parsed[0];
        if (!shouldContinue) return parsed[1];

        immutable LineBuffered flushPolicy = options.lineBuffered ? Yes.lineBuffered : No.lineBuffered;

        try
        {
            tsvAppend(options, BufferedOutputRange!(typeof(stdout))(stdout, flushPolicy));
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", options.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}
55 
/** Long-form help text. Printed ahead of the option list when '--help-verbose'
 * is given on the command line.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can
be customized using the --f|file option.

Example: Header processing:

   $ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

   $ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

   $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";
91 
/** Short help text. Printed ahead of the option list when getopt reports that
 * help was requested.
 */
auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";
102 
/** Container for command line options.
 *
 * The fields are filled in by processArgs. After a successful call, 'files' holds
 * the inputs in processing order (those given via '--f|file' first, followed by the
 * positional file arguments) and 'fileSourceNames' has a source value for each of
 * them, plus an entry for "-" (standard input).
 */
struct TsvAppendOptions
{
    string programName;                /// Base name of cmdArgs[0], without extension
    string[] files;                    /// Input files
    string[string] fileSourceNames;    /// Maps file path to the 'source' value
    string sourceHeader;               /// --s|source-header
    bool trackSource = false;          /// --t|track-source
    bool hasHeader = false;            /// --H|header
    char delim = '\t';                 /// --d|delimiter
    bool lineBuffered = false;         /// --line-buffered

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. The text before it is the source
     * value, the text after it is the file path. Both parts must be non-empty,
     * otherwise an exception is thrown (via enforce). On success the file is appended
     * to 'files' and its source value recorded in 'fileSourceNames'.
     */
    private void fileOptionHandler(string option, string optionVal) pure @safe
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");

        enforce(!valSplit[0].empty && !valSplit[2].empty,
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help or version information. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means the options have been parsed and the
     * derived values calculated:
     *   - '--f|file' or '--s|source-header' turn on source tracking.
     *   - '--s|source-header' turns on header processing.
     *   - With header processing on, the source header defaults to "file".
     *   - Remaining (non-option) arguments are appended to 'files', using the file's
     *     base name without extension as the source value.
     *   - "-" (standard input) maps to the source value "stdin" unless a mapping for
     *     it was already recorded.
     *
     * Exceptions raised while processing arguments are caught, reported on stderr, and
     * turned into a (false, 1) result.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        bool helpVerbose = false;          // --help-verbose
        bool versionWanted = false;        // --V|version

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "          Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",        "          Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source",  "          Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR       Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file",          "STR=FILE  Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter",     "CHR       Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                "line-buffered",   "          Immediately output every line.", &lineBuffered,
                std.getopt.config.caseSensitive,
                "V|version",       "          Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks.
             *
             * At this point 'files' contains only entries added by the '--f|file'
             * option handler, so a non-empty list means '--f|file' was used. This
             * check must stay ahead of the loop adding the positional file arguments.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless it was included in the --file option. */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
215 
/** tsvAppend is the core of the tsv-append program.
 *
 * Each input named in cmdopt.files (or standard input, "-", when the list is empty)
 * is read line by line and written to outputStream, each line terminated by a
 * newline. When cmdopt.hasHeader is set, the first line of every input is treated
 * as a header and only the first one encountered is written. When
 * cmdopt.trackSource is set, every output line gets a leading column: the source
 * header on the header line, the input's source value on all other lines.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, LineBuffered,
        ReadHeader;

    immutable LineBuffered lineBufferedFlag = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
    immutable ReadHeader readHeaderFlag = cmdopt.hasHeader ? Yes.readHeader : No.readHeader;

    /* With no inputs listed, read standard input, represented by "-". */
    auto inputPaths = (cmdopt.files.length > 0) ? cmdopt.files : ["-"];

    bool headerEmitted = false;
    foreach (path; inputPaths)
    {
        immutable bool isStdin = (path == "-");
        auto input = isStdin ? stdin : path.File();
        auto sourceValue = cmdopt.fileSourceNames[path];

        foreach (lineNum, line;
                 input
                 .bufferedByLine!(KeepTerminator.no)(lineBufferedFlag, readHeaderFlag)
                 .enumerate(1))
        {
            immutable bool isHeaderLine = cmdopt.hasHeader && lineNum == 1;

            /* Only the first header seen is written; later ones are dropped. */
            if (isHeaderLine && headerEmitted) continue;

            if (cmdopt.trackSource)
            {
                outputStream.put(isHeaderLine ? cmdopt.sourceHeader : sourceValue);
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(line);
            outputStream.put('\n');

            if (isHeaderLine)
            {
                headerEmitted = true;

                /* Push the header out right away. Downstream commands in a unix
                 * pipeline can then detect errors early instead of waiting for
                 * the full data stream. An upstream command may likewise have
                 * flushed its header, so a header can show up well ahead of the
                 * bulk of the data.
                 */
                static if (isFlushableOutputRange!OutputRange) outputStream.flush;
            }
        }

        /* Close explicitly; files don't always close quickly enough on their own. */
        if (!isStdin) input.close;
    }
}
274 
version(unittest)
{
    /* Helpers shared by the unit tests. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.

    /** Runs tsvAppend for one command line and checks the output.
     *
     * cmdArgs is a full argument list; its first element doubles as the test name
     * in assertion messages. expected is the expected output, one inner array per
     * row. It is converted to text with tsvDataToString and compared against what
     * tsvAppend wrote to an in-memory appender.
     */
    void testTsvAppend(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");

        /* Prefixes an assertion message with the helper name and the test name. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testTsvAppend] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* Take a printable copy of the arguments first; processArgs receives
         * cmdArgs by reference.
         */
        auto savedCmdArgs = cmdArgs.to!string;

        TsvAppendOptions cmdopt;
        auto argsResult = cmdopt.processArgs(cmdArgs);
        assert(argsResult[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        auto actual = appender!(char[])();
        tsvAppend(cmdopt, actual);

        auto expectedOutput = expected.tsvDataToString;
        immutable bool outputMatches = (actual.data == expectedOutput);

        assert(outputMatches,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, actual.data.to!string));
    }
}
309 
/* End-to-end tests of argument processing plus tsvAppend, driven through the
 * testTsvAppend helper. Input files are written to a temporary directory that is
 * removed when the test block exits.
 */
unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_append");
    scope(exit) testDir.rmdirRecurse;

    /* Test data. The first row of each data set is the header row; all data sets
     * share the same header.
     */
    string[][] data1 =
        [["field_a", "field_b", "field_c"],
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] data2 =
        [["field_a", "field_b", "field_c"],
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] data3 =
        [["field_a", "field_b", "field_c"],
         ["yellow", "9", "κίτρινος"]];

    string[][] dataHeaderRowOnly =
        [["field_a", "field_b", "field_c"]];

    /* A single row with no fields.
     * NOTE(review): how writeUnittestTsvFile and tsvDataToString render this row is
     * defined in unittest_utils, which is not visible here.
     */
    string[][] dataEmpty = [[]];

    string filepath1 = buildPath(testDir, "file1.tsv");
    string filepath2 = buildPath(testDir, "file2.tsv");
    string filepath3 = buildPath(testDir, "file3.tsv");
    string filepathHeaderRowOnly = buildPath(testDir, "fileHeaderRowOnly.tsv");
    string filepathEmpty = buildPath(testDir, "fileEmpty.tsv");

    writeUnittestTsvFile(filepath1, data1);
    writeUnittestTsvFile(filepath2, data2);
    writeUnittestTsvFile(filepath3, data3);
    writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(filepathEmpty, dataEmpty);

    /* Plain concatenation. Without --header every line is passed through, so the
     * header row of each file appears in the output.
     */
    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);

    /* With --header, only the header from the first file is written. */
    testTsvAppend(["test-4", "--header", filepath1, filepath2],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"]]);

    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    /* Inputs that include the empty-row file and the header-only file, first
     * without and then with header processing.
     */
    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    /* Source tracking without --header: every line, including each file's header
     * row, is prefixed with the file's source value.
     */
    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    /* Source tracking with --header: the source column header defaults to "file". */
    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    /* Custom source column header via --source-header. */
    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    /* Custom source values via '--file <source>=<file>'. */
    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", filepath1),
                   "--file", format("1b=%s", filepath2), "--file", format("1c=%s", filepath3)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    /* '-s' on its own turns on header processing and source tracking. Files given
     * with '-f' are mixed with a positional file, which uses its base name as the
     * source value.
     */
    testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    /* Repeats of earlier cases with --line-buffered; the expected output is the same. */
    testTsvAppend(["test-13", "--line-buffered", filepath1], data1);
    testTsvAppend(["test-14", "--line-buffered", "--header", filepath1], data1);
    testTsvAppend(["test-15", "--line-buffered", filepath1, filepath2], data1 ~ data2);
    testTsvAppend(["test-16", "--line-buffered", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-17", "--line-buffered", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);
}