1 /**
2 Command line tool that appends multiple TSV files. It is header aware and supports
3 tracking the original source file of each row.
4 
5 Copyright (c) 2017-2018, eBay Software Foundation
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_append;
11 
12 import std.conv : to;
13 import std.range;
14 import std.stdio;
15 import std.typecons : tuple;
16 
version(unittest)
{
    // Unit test builds take their entry point from the compiler's -main switch.
}
else
{
    /** Program entry point.
     *
     * Parses the command line into a TsvAppendOptions and, when processing should
     * continue, runs tsvAppend writing to standard output.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first.
     *
     * Returns: The exit code supplied by processArgs when it asks to stop (help,
     *   version, or an argument error); 1 if tsvAppend throws an Exception, which
     *   is reported on stderr; otherwise 0.
     */
    int main(string[] cmdArgs)
    {
        import tsvutil : BufferedOutputRange;

        /* Under DMD code coverage builds, merge reports rather than overwrite them. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvAppendOptions cmdopt;
        auto argsStatus = cmdopt.processArgs(cmdArgs);

        /* argsStatus[0]: keep going?  argsStatus[1]: exit code to use when stopping. */
        if (!argsStatus[0])
        {
            return argsStatus[1];
        }

        int exitCode = 0;
        try
        {
            tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            exitCode = 1;
        }
        return exitCode;
    }
}
48 
/** Long-form help text. processArgs prints it, followed by the generated option
 * list, when --help-verbose is given.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

   $ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

   $ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

   $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";

/** Concise help text. processArgs prints it, followed by the generated option
 * list, when getopt reports that help was requested.
 */
auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";
95 
/** Container for command line options.
 *
 * Holds the values parsed from the command line plus the settings derived from
 * them. Call processArgs to populate an instance; tsvAppend then reads it.
 */
struct TsvAppendOptions
{
    string programName;                // Derived from cmdArgs[0] by processArgs
    string[] files;                    // Input files: --f|file entries first, then positional args
    string[string] fileSourceNames;    // Maps file path to the 'source' value
    bool helpVerbose = false;          // --help-verbose
    string sourceHeader;               // --s|source-header
    bool trackSource = false;          // --t|track-source
    bool hasHeader = false;            // --H|header
    char delim = '\t';                 // --d|delimiter
    bool versionWanted = false;        // --V|version

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. The part before it is the
     * source value, the part after it is the file path. The path is appended
     * to 'files' and its source value recorded in 'fileSourceNames'.
     *
     * Throws: Exception if there is no '=' or if either side of it is empty.
     */
    private void fileOptionHandler(string option, string optionVal)
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");
        if (valSplit[0].empty || valSplit[2].empty)
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help or version information. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means derived values have been calculated:
     * - trackSource is turned on if --f|file or --s|source-header was used.
     * - hasHeader is turned on if --s|source-header was used.
     * - sourceHeader defaults to "file" when headers are in use.
     * - Arguments left over after option parsing have been appended to 'files', each
     *   mapped to its base name without extension in 'fileSourceNames'.
     * - 'fileSourceNames' has an entry for "-".
     *
     * Errors raised during processing are reported on stderr rather than propagated.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "          Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",        "          Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source",  "          Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR       Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file",          "STR=FILE  Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter",     "CHR       Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version",       "          Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only the
             * entries added by the --f|file option handler.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. The source value is the
             * base name without its extension.
             */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless it was included in the --file option.
             * NOTE(review): a positional "-" is given a mapping by the loop above, so
             * it looks like "stdin" is only used when no "-" was given at all. Confirm
             * this is intended.
             */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
205 
/** tsvAppend implements the basic functionality of the tsv-append program.
 *
 * Reads each file listed in cmdopt.files in order, or standard input if the list is
 * empty, and writes the lines to outputStream, each terminated by a newline. The
 * file name "-" reads from standard input.
 *
 * When cmdopt.hasHeader is set, the first line of every file is treated as a header.
 * Only the first header encountered is written; later ones are dropped. When
 * cmdopt.trackSource is set, every output line is prefixed with a source column
 * followed by cmdopt.delim: cmdopt.sourceHeader on the header line, and the file's
 * entry in cmdopt.fileSourceNames on every other line.
 *
 * Params:
 *   cmdopt       = Options, as populated by TsvAppendOptions.processArgs.
 *   outputStream = Output range of char receiving the result.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    bool headerWritten = false;
    foreach (filename; (cmdopt.files.length > 0) ? cmdopt.files : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        auto sourceName = cmdopt.fileSourceNames[filename];

        /* The first line is identified with a flag rather than a line counter. A
         * counter from enumerate(1) has type int and would wrap back to 1 on very
         * large inputs, causing a data line to be handled as a header.
         */
        bool isFirstLine = true;
        foreach (line; inputStream.byLine(KeepTerminator.no))
        {
            const bool isHeaderLine = cmdopt.hasHeader && isFirstLine;
            isFirstLine = false;

            /* Only the first header seen across all files is written. */
            if (isHeaderLine && headerWritten) continue;

            if (cmdopt.trackSource)
            {
                outputStream.put(isHeaderLine ? cmdopt.sourceHeader : sourceName);
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(line);
            outputStream.put('\n');

            if (isHeaderLine) headerWritten = true;
        }
    }
}
245 
version(unittest)
{
    /* Helpers shared by the unit tests. */

    import unittest_utils;   // tsv unit test helpers, from common/src/.

    /* Runs one tsv-append test case in-process.
     *
     * cmdArgs is a command line whose first element serves as the test name in
     * failure messages. It is parsed with TsvAppendOptions.processArgs, tsvAppend is
     * run into an in-memory buffer, and the buffer is compared with 'expected' after
     * conversion by tsvDataToString. Any mismatch or argument failure trips an assert.
     */
    void testTsvAppend(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");

        /* Builds a failure message prefixed with the test name (cmdArgs[0]). */
        auto failureText(T...)(string detail, T detailArgs)
        {
            return format("[testTsvAppend] %s: " ~ detail, cmdArgs[0], detailArgs);
        }

        /* Capture the arguments as text first: processArgs takes cmdArgs by ref. */
        auto argsAsText = to!string(cmdArgs);

        TsvAppendOptions options;
        auto parseStatus = options.processArgs(cmdArgs);
        assert(parseStatus[0], failureText("Invalid command lines arg: '%s'.", argsAsText));

        auto actual = appender!(char[])();
        tsvAppend(options, actual);

        auto wanted = expected.tsvDataToString;
        assert(actual.data == wanted,
               failureText(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   wanted.to!string, actual.data.to!string));
    }
}
280 
/* End-to-end tests: write small TSV files to a temporary directory and run
 * tsv-append command lines against them via testTsvAppend.
 */
unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;

    auto testDir = makeUnittestTempDir("tsv_append");
    scope(exit) testDir.rmdirRecurse;

    /* Writes 'rows' to a TSV file named 'name' in the temp directory; returns its path. */
    string makeFile(string name, string[][] rows)
    {
        string path = buildPath(testDir, name);
        writeUnittestTsvFile(path, rows);
        return path;
    }

    /* Returns a copy of 'rows' with 'source' prepended to each row as the first field. */
    string[][] tagged(string source, string[][] rows)
    {
        string[][] result;
        foreach (row; rows) result ~= ([source] ~ row);
        return result;
    }

    /* All data files share one header row; the bodies differ per file. */
    string[] header = ["field_a", "field_b", "field_c"];

    string[][] body1 =
        [["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] body2 =
        [["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] body3 =
        [["yellow", "9", "κίτρινος"]];

    string[][] data1 = [header] ~ body1;
    string[][] data2 = [header] ~ body2;
    string[][] data3 = [header] ~ body3;
    string[][] dataHeaderRowOnly = [header];
    string[][] dataEmpty = [[]];

    string filepath1 = makeFile("file1.tsv", data1);
    string filepath2 = makeFile("file2.tsv", data2);
    string filepath3 = makeFile("file3.tsv", data3);
    string filepathHeaderRowOnly = makeFile("fileHeaderRowOnly.tsv", dataHeaderRowOnly);
    string filepathEmpty = makeFile("fileEmpty.tsv", dataEmpty);

    /* Plain concatenation, with and without header handling. */
    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);

    testTsvAppend(["test-4", "--header", filepath1, filepath2],
                  [header] ~ body1 ~ body2);

    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3],
                  [header] ~ body1 ~ body2 ~ body3);

    /* Empty and header-only files mixed in. */
    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [header] ~ body1 ~ body2 ~ body3);

    /* Source tracking without headers: every line, header rows included, is tagged. */
    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  tagged("file1", data1) ~ tagged("file2", data2));

    /* Source tracking with headers: default and custom source column headers. */
    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file"] ~ header] ~ tagged("file1", body1) ~ tagged("file2", body2));

    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source"] ~ header]
                  ~ tagged("file1", body1) ~ tagged("file2", body2) ~ tagged("file3", body3));

    /* Custom source values via --file, alone and mixed with a positional file. */
    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", "1a=" ~ filepath1,
                   "--file", "1b=" ~ filepath2, "--file", "1c=" ~ filepath3],
                  [["id"] ~ header]
                  ~ tagged("1a", body1) ~ tagged("1b", body2) ~ tagged("1c", body3));

    testTsvAppend(["test-12", "-s", "id", "-f", "1a=" ~ filepath1,
                   "-f", "1b=" ~ filepath2, filepath3],
                  [["id"] ~ header]
                  ~ tagged("1a", body1) ~ tagged("1b", body2) ~ tagged("file3", body3));
}