1 /**
2 Command line tool that appends multiple TSV files. It is header aware and supports
3 tracking the original source file of each row.
4 
5 Copyright (c) 2017-2020, eBay Inc.
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_utils.tsv_append;
11 
12 import std.conv : to;
13 import std.exception : enforce;
14 import std.range;
15 import std.stdio;
16 import std.typecons : tuple;
17 
/* Embedded D runtime options. "gcopt=cleanup:none" is a druntime GC setting; per the
 * druntime GC configuration docs it skips the garbage collector's cleanup pass at
 * program termination. The declaration is only compiled for compiler versions 2.085
 * and later - presumably the first version supporting this setting (TODO confirm).
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
19 
20 version(unittest)
21 {
22     // When running unit tests, use main from -main compiler switch.
23 }
24 else
25 {
26     /** Main program. Invokes command line arg processing and tsv-append to perform
27      * the real work. Any errors are caught and reported.
28      */
29     int main(string[] cmdArgs)
30     {
31         import tsv_utils.common.utils : BufferedOutputRange;
32         /* When running in DMD code coverage mode, turn on report merging. */
33         version(D_Coverage) version(DigitalMars)
34         {
35             import core.runtime : dmd_coverSetMerge;
36             dmd_coverSetMerge(true);
37         }
38 
39         TsvAppendOptions cmdopt;
40         auto r = cmdopt.processArgs(cmdArgs);
41         if (!r[0]) return r[1];
42         try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
43         catch (Exception exc)
44         {
45             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
46             return 1;
47         }
48         return 0;
49     }
50 }
51 
/** Detailed help text. Printed ahead of the generated option list when the
 * '--help-verbose' option is given.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

   $ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

   $ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

   $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";
87 
/** Brief help text. Printed ahead of the generated option list when getopt
 * reports that help was requested.
 */
string helpText = q"HELP
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
HELP";
98 
/** Container for command line options.
 *
 * processArgs fills in the fields from the command line. The input file list and
 * the file-to-source-name mapping are built from both the '--f|file' option and
 * the positional file arguments.
 */
struct TsvAppendOptions
{
    string programName;                /// Program name derived from cmdArgs[0]; used in error messages
    string[] files;                    /// Input files
    string[string] fileSourceNames;    /// Maps file path to the 'source' value
    string sourceHeader;               /// --s|source-header
    bool trackSource = false;          /// --t|track-source
    bool hasHeader = false;            /// --H|header
    char delim = '\t';                 /// --d|delimiter

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. The text before it is the source
     * name, the text after it is the file path. The path is appended to 'files' and
     * the source name recorded in 'fileSourceNames'.
     *
     * Throws: Exception (via enforce) if there is no '=' or either side is empty.
     */
    private void fileOptionHandler(string option, string optionVal) pure @safe
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");

        enforce(!valSplit[0].empty && !valSplit[2].empty,
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help or version information. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means derived values have been calculated:
     * - trackSource is turned on if '--f|file' or '--s|source-header' was used.
     * - hasHeader is turned on if '--s|source-header' was used.
     * - sourceHeader is set to "file" if header processing is on and no name was given.
     * - Positional arguments have been appended to 'files', after any '--f|file'
     *   entries, with the file name (without directory or extension) as source name.
     * - 'fileSourceNames' has an entry for "-" (standard input).
     *
     * Exceptions thrown during processing are caught and reported on stderr. The
     * return value is then (false, 1).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        bool helpVerbose = false;          // --help-verbose
        bool versionWanted = false;        // --V|version

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "          Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",        "          Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source",  "          Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR       Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file",          "STR=FILE  Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter",     "CHR       Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version",       "          Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only the
             * entries added by the '--f|file' option handler.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless it was included in the --file option.
             *
             * NOTE(review): A positional '-' argument is also given a mapping by the loop
             * above (derived from the path text), so the "stdin" default only applies when
             * '-' was not passed explicitly. Confirm this is the intended source name.
             */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
209 
/** tsvAppend concatenates the input files and writes the result to outputStream.
 *
 * Files are read in the order listed in cmdopt.files. Standard input ("-") is read
 * if the list is empty. Each line is written followed by a newline.
 *
 * When cmdopt.hasHeader is set, the first line of each file is treated as a header.
 * Only the first header seen is written, the others are dropped. When
 * cmdopt.trackSource is set, each data line is prefixed with the file's source name
 * (from cmdopt.fileSourceNames) and the delimiter. The header line is prefixed with
 * cmdopt.sourceHeader instead.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange;

    auto inputFiles = (cmdopt.files.length > 0) ? cmdopt.files : ["-"];
    bool headerPending = true;

    foreach (path; inputFiles)
    {
        immutable bool isStdin = (path == "-");
        auto input = isStdin ? stdin : File(path);
        auto sourceName = cmdopt.fileSourceNames[path];

        size_t lineNum = 0;
        foreach (line; input.bufferedByLine!(KeepTerminator.no))
        {
            ++lineNum;
            immutable bool isHeaderLine = cmdopt.hasHeader && lineNum == 1;

            /* Only the first header encountered is written. */
            if (isHeaderLine && !headerPending) continue;

            if (cmdopt.trackSource)
            {
                outputStream.put(isHeaderLine ? cmdopt.sourceHeader : sourceName);
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(line);
            outputStream.put('\n');

            if (isHeaderLine)
            {
                headerPending = false;

                /* Push the header out right away. Tasks later in a unix pipeline can
                 * then detect errors quickly, without waiting for all the data to
                 * flow through. An upstream task may have done the same, so the
                 * header can arrive well ahead of the main block of data.
                 */
                static if (isFlushableOutputRange!OutputRange) outputStream.flush;
            }
        }

        /* Close explicitly, files don't always close quickly enough on their own. */
        if (!isStdin) input.close;
    }
}
261 
262 version(unittest)
263 {
264     /* Unit test helper functions. */
265 
266     import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
267 
268     void testTsvAppend(string[] cmdArgs, string[][] expected)
269     {
270         import std.array : appender;
271         import std.format : format;
272 
273         assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");
274 
275         auto formatAssertMessage(T...)(string msg, T formatArgs)
276         {
277             auto formatString = "[testTsvAppend] %s: " ~ msg;
278             return format(formatString, cmdArgs[0], formatArgs);
279         }
280 
281         TsvAppendOptions cmdopt;
282         auto savedCmdArgs = cmdArgs.to!string;
283         auto r = cmdopt.processArgs(cmdArgs);
284         assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
285 
286         auto output = appender!(char[])();
287         tsvAppend(cmdopt, output);
288         auto expectedOutput = expected.tsvDataToString;
289 
290         assert(output.data == expectedOutput,
291                formatAssertMessage(
292                    "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
293                    expectedOutput.to!string, output.data.to!string));
294     }
295  }
296 
/* End-to-end tests of option processing plus tsvAppend. Test files are written to a
 * temporary directory that is removed when the test block exits.
 */
unittest
{
    import std.file : rmdirRecurse;
    import std.path : buildPath;

    auto tempDir = makeUnittestTempDir("tsv_append");
    scope(exit) tempDir.rmdirRecurse;

    /* Header row shared by all the non-empty test files. */
    string[] header = ["field_a", "field_b", "field_c"];

    string[][] data1 =
        [header,
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] data2 =
        [header,
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] data3 =
        [header,
         ["yellow", "9", "κίτρινος"]];

    string[][] dataHeaderRowOnly = [header];

    string[][] dataEmpty = [[]];

    string path1 = buildPath(tempDir, "file1.tsv");
    string path2 = buildPath(tempDir, "file2.tsv");
    string path3 = buildPath(tempDir, "file3.tsv");
    string pathHeaderRowOnly = buildPath(tempDir, "fileHeaderRowOnly.tsv");
    string pathEmpty = buildPath(tempDir, "fileEmpty.tsv");

    writeUnittestTsvFile(path1, data1);
    writeUnittestTsvFile(path2, data2);
    writeUnittestTsvFile(path3, data3);
    writeUnittestTsvFile(pathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(pathEmpty, dataEmpty);

    /* Expected results with header processing: the header once, then the data rows
     * of each file in order.
     */
    string[][] headerAware12 = data1 ~ data2[1 .. $];
    string[][] headerAware123 = headerAware12 ~ data3[1 .. $];

    /* Plain concatenation, with and without header processing. */
    testTsvAppend(["test-1", path1], data1);
    testTsvAppend(["test-2", "--header", path1], data1);
    testTsvAppend(["test-3", path1, path2], data1 ~ data2);
    testTsvAppend(["test-4", "--header", path1, path2], headerAware12);
    testTsvAppend(["test-5", "--header", path1, path2, path3], headerAware123);

    /* Files with no data rows mixed in. */
    testTsvAppend(["test-6", path1, pathEmpty, path2, pathHeaderRowOnly, path3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", path1, pathEmpty, path2, pathHeaderRowOnly, path3],
                  headerAware123);

    /* Source tracking without header processing: every line gets the source name. */
    testTsvAppend(["test-8", "--track-source", path1, path2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    /* Source tracking with header processing: default source column header. */
    testTsvAppend(["test-9", "--header", "--track-source", path1, path2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    /* Custom source column header. */
    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   path1, pathEmpty, path2, pathHeaderRowOnly, path3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    /* Custom source values via the --file option. */
    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", "1a=" ~ path1,
                   "--file", "1b=" ~ path2, "--file", "1c=" ~ path3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    /* --source-header and --file imply header processing and source tracking. A
     * positional file after the --file entries uses its file name as source value.
     */
    testTsvAppend(["test-12", "-s", "id", "-f", "1a=" ~ path1,
                   "-f", "1b=" ~ path2, path3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);
}