tsv_join source code

1 /**
2 Command line tool that joins tab-separated value files based on a common key.
3 
4 This tool joins lines from tab-delimited files based on a common key. One file, the 'filter'
5 file, contains the records (lines) being matched. The other input files are searched for
6 matching records. Matching records are written to standard output, along with any designated
7 fields from the 'filter' file. In database parlance this is a 'hash semi-join'.
8 
9 Copyright (c) 2015-2018, eBay Software Foundation
10 Initially written by Jon Degenhardt
11 
12 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
13 */
14 module tsv_join;
15 
16 import std.stdio;
17 import std.format : format;
18 import std.typecons : tuple;
19 
20 auto helpText = q"EOS
21 Synopsis: tsv-join --filter-file file [options] file [file...]
22 
23 tsv-join matches input lines against lines from a 'filter' file. The match is
24 based on fields or the entire line. Use '--help-verbose' for more details.
25 
26 Options:
27 EOS";
28 
29 auto helpTextVerbose = q"EOS
30 Synopsis: tsv-join --filter-file file [options] file [file...]
31 
32 tsv-join matches input lines against lines from a 'filter' file. The match is
33 based on exact match comparison of one or more 'key' fields. Fields are TAB
34 delimited by default. Matching lines are written to standard output, along with
35 any additional fields from the key file that have been specified. An example:
36 
37   tsv-join --filter-file filter.tsv --key-fields 1 --append-fields 5,6 data.tsv
38 
39 This reads filter.tsv, creating a hash table keyed on field 1. Lines from data.tsv
40 are read one at a time. If field 1 is found in the hash table, the line is written
41 to standard output with fields 5 and 6 from the filter file appended. In database
42 parlance this is a "hash semi join". Note the asymmetric relationship: Records in
43 the filter file should be unique, but data.tsv lines can repeat.
44 
45 tsv-join can also work as a simple filter, this is the default behavior. Example:
46 
47   tsv-join --filter-file filter.tsv data.tsv
48 
49 This outputs all lines from data.tsv found in filter.tsv. --key-fields can still
50 be used to define the match key. The --exclude option can be used to exclude
51 matched lines rather than keep them.
52 
53 Multiple fields can be specified as keys and append fields. Field numbers start
54 at one, zero represents the whole line. Fields are comma separated and ranges
55 can be used. Example:
56 
57   tsv-join -f filter.tsv -k 1,2 --append-fields 3-7 data.tsv
58 
59 Options:
60 EOS";
61 
62 /** Container for command line options.
63  */
64 struct TsvJoinOptions
65 {
66     string programName;
67     string filterFile;               // --filter
68     size_t[] keyFields;              // --key-fields
69     size_t[] dataFields;             // --data-fields
70     size_t[] appendFields;           // --append-fields
71     bool hasHeader = false;          // --H|header
72     string appendHeaderPrefix = "";  // --append-header-prefix
73     bool writeAll = false;           // --write-all
74     string writeAllValue;            // --write-all
75     bool exclude = false;            // --exclude
76     char delim = '\t';               // --delimiter
77     bool helpVerbose = false;        // --help-verbose
78     bool versionWanted = false;      // --V|version
79     bool allowDupliateKeys = false;  // --allow-duplicate-keys
80     bool keyIsFullLine = false;      // Derived: --key-fields 0
81     bool dataIsFullLine = false;     // Derived: --data-fields 0
82     bool appendFullLine = false;     // Derived: --append-fields 0
83 
84     /* Returns a tuple. First value is true if command line arguments were successfully
85      * processed and execution should continue, or false if an error occurred or the user
86      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
87      *
88      * Returning true (execution continues) means args have been validated and derived
89      * values calculated. In addition, field indices have been converted to zero-based.
90      * If the whole line is the key, the individual fields lists will be cleared.
91      */
92     auto processArgs (ref string[] cmdArgs)
93     {
94         import std.algorithm : any, each;
95         import std.getopt;
96         import std.path : baseName, stripExtension;
97         import std.typecons : Yes, No;
98         import tsvutil :  makeFieldListOptionHandler;
99 
100         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
101 
102         /* Handler for --write-all. Special handler so two values can be set. */
103         void writeAllHandler(string option, string value)
104         {
105             debug stderr.writeln("[writeAllHandler] |", option, "|  |", value, "|");
106             writeAll = true;
107             writeAllValue = value;
108         }
109 
110         try
111         {
112             arraySep = ",";    // Use comma to separate values in command line options
113             auto r = getopt(
114                 cmdArgs,
115                 "help-verbose",    "              Print full help.", &helpVerbose,
116                 "f|filter-file",   "FILE          (Required) File with records to use as a filter.", &filterFile,
117 
118                 "k|key-fields",    "<field-list>  Fields to use as join key. Default: 0 (entire line).",
119                 keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
120 
121                 "d|data-fields",   "<field-list>  Data record fields to use as join key, if different than --key-fields.",
122                 dataFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
123 
124                 "a|append-fields", "<field-list>  Filter fields to append to matched records.",
125                 appendFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
126 
127                 std.getopt.config.caseSensitive,
128                 "H|header",        "              Treat the first line of each file as a header.", &hasHeader,
129                 std.getopt.config.caseInsensitive,
130                 "p|prefix",        "STR           String to use as a prefix for --append-fields when writing a header line.", &appendHeaderPrefix,
131                 "w|write-all",     "STR           Output all data records. STR is the --append-fields value when writing unmatched records.", &writeAllHandler,
132                 "e|exclude",       "              Exclude matching records.", &exclude,
133                 "delimiter",       "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
134                 "z|allow-duplicate-keys",
135                                    "              Allow duplicate keys with different append values (last entry wins).", &allowDupliateKeys,
136                 std.getopt.config.caseSensitive,
137                 "V|version",       "              Print version information and exit.", &versionWanted,
138                 std.getopt.config.caseInsensitive,
139                 );
140 
141             if (r.helpWanted)
142             {
143                 defaultGetoptPrinter(helpText, r.options);
144                 return tuple(false, 0);
145             }
146             else if (helpVerbose)
147             {
148                 defaultGetoptPrinter(helpTextVerbose, r.options);
149                 return tuple(false, 0);
150             }
151             else if (versionWanted)
152             {
153                 import tsvutils_version;
154                 writeln(tsvutilsVersionNotice("tsv-join"));
155                 return tuple(false, 0);
156             }
157 
158             consistencyValidations(cmdArgs);
159             derivations();
160         }
161         catch (Exception exc)
162         {
163             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
164             return tuple(false, 1);
165         }
166         return tuple(true, 0);
167     }
168 
169     /* This routine does validations not handled by getopt, usually because they
170      * involve interactions between multiple parameters.
171      */
172     private void consistencyValidations(ref string[] processedCmdArgs)
173     {
174         import std.algorithm : any;
175 
176         if (filterFile.length == 0)
177         {
178             throw new Exception("Required option --filter-file was not supplied.");
179         }
180         else if (filterFile == "-" && processedCmdArgs.length == 1)
181         {
182             throw new Exception("A data file is required when standard input is used for the filter file (--f|filter-file -).");
183         }
184 
185         if (writeAll && appendFields.length == 0)
186         {
187             throw new Exception("Use --a|append-fields when using --w|write-all.");
188         }
189 
190         if (writeAll && appendFields.length == 1 && appendFields[0] == 0)
191         {
192             throw new Exception("Cannot use '--a|append-fields 0' (whole line) when using --w|write-all.");
193         }
194 
195         if (appendFields.length > 0 && exclude)
196         {
197             throw new Exception("--e|exclude cannot be used with --a|append-fields.");
198         }
199 
200         if (appendHeaderPrefix.length > 0 && !hasHeader)
201         {
202             throw new Exception("Use --header when using --p|prefix.");
203         }
204 
205         if (dataFields.length > 0 && keyFields.length != dataFields.length)
206         {
207             throw new Exception("Different number of --k|key-fields and --d|data-fields.");
208         }
209 
210         if (keyFields.length == 1 && dataFields.length == 1 &&
211             ((keyFields[0] == 0 && dataFields[0] != 0) || (keyFields[0] != 0 && dataFields[0] == 0)))
212         {
213             throw new Exception("If either --k|key-field or --d|data-field is zero both must be zero.");
214         }
215 
216         if ((keyFields.length > 1    && any!(a => a == 0)(keyFields)) ||
217             (dataFields.length > 1   && any!(a => a == 0)(dataFields)) ||
218             (appendFields.length > 1 && any!(a => a == 0)(appendFields)))
219         {
220             throw new Exception("Field 0 (whole line) cannot be combined with individual fields (non-zero).");
221         }
222 
223     }
224 
225     /* Post-processing derivations. */
226     void derivations()
227     {
228         import std.algorithm : each;
229         import std.range;
230 
231         // Convert 'full-line' field indexes (index zero) to boolean flags.
232         if (keyFields.length == 0)
233         {
234             assert(dataFields.length == 0);
235             keyIsFullLine = true;
236             dataIsFullLine = true;
237         }
238         else if (keyFields.length == 1 && keyFields[0] == 0)
239         {
240             keyIsFullLine = true;
241             keyFields.popFront;
242             dataIsFullLine = true;
243 
244             if (dataFields.length == 1)
245             {
246                 assert(dataFields[0] == 0);
247                 dataFields.popFront;
248             }
249         }
250 
251         if (appendFields.length == 1 && appendFields[0] == 0)
252         {
253             appendFullLine = true;
254             appendFields.popFront;
255         }
256 
257         assert(!(keyIsFullLine && keyFields.length > 0));
258         assert(!(dataIsFullLine && dataFields.length > 0));
259         assert(!(appendFullLine && appendFields.length > 0));
260 
261         // Switch to zero-based field indexes.
262         keyFields.each!((ref a) => --a);
263         dataFields.each!((ref a) => --a);
264         appendFields.each!((ref a) => --a);
265     }
266 }
267 
268 /** Main program.
269  */
270 int main(string[] cmdArgs)
271 {
272     /* When running in DMD code coverage mode, turn on report merging. */
273     version(D_Coverage) version(DigitalMars)
274     {
275         import core.runtime : dmd_coverSetMerge;
276         dmd_coverSetMerge(true);
277     }
278 
279     TsvJoinOptions cmdopt;
280     auto r = cmdopt.processArgs(cmdArgs);
281     if (!r[0]) return r[1];
282     try tsvJoin(cmdopt, cmdArgs[1..$]);
283     catch (Exception exc)
284     {
285         stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
286         return 1;
287     }
288     return 0;
289 }
290 
291 /** tsvJoin does the primary work of the tsv-join program.
292  */
293 void tsvJoin(in TsvJoinOptions cmdopt, in string[] inputFiles)
294 {
295     import tsvutil : InputFieldReordering, BufferedOutputRange, throwIfWindowsNewlineOnUnix;
296     import std.algorithm : splitter;
297     import std.array : join;
298     import std.range;
299     import std.conv : to;
300 
301     /* State, variables, and convenience derivations.
302      *
303      * Combinations of individual fields and whole line (field zero) are convenient for the
304      * user, but create complexities for the program. Many combinations are disallowed by
305      * command line processing, but the remaining combos still leave several states. Also,
306      * this code optimizes by doing only necessary operations, further complicating state
307      * Here's a guide to variables and state.
308      * - cmdopt.keyFields, cmdopt.dataFields arrays - Individual field indexes used as keys.
309      *      Empty if the  whole line is used as a key. Must be the same length.
310      * - cmdopt.keyIsFullLine, cmdopt.dataIsFullLine - True when the whole line is used key.
311      * - cmdopt.appendFields array - Indexes of individual filter file fields being appended.
312      *      Empty if appending the full line, or if not appending anything.
313      * - cmdopt.appendFullLine - True when the whole line is being appended.
314      * - isAppending - True is something is being appended.
315      * - cmdopt.writeAll - True if all lines are being written
316      */
317     /* Convenience derivations. */
318     auto numKeyFields = cmdopt.keyFields.length;
319     auto numAppendFields = cmdopt.appendFields.length;
320     bool isAppending = (cmdopt.appendFullLine || numAppendFields > 0);
321 
322     /* Mappings from field indexes in the input lines to collection arrays. */
323     auto filterKeysReordering = new InputFieldReordering!char(cmdopt.keyFields);
324     auto dataKeysReordering = (cmdopt.dataFields.length == 0) ?
325         filterKeysReordering : new InputFieldReordering!char(cmdopt.dataFields);
326     auto appendFieldsReordering = new InputFieldReordering!char(cmdopt.appendFields);
327 
328     /* The master filter hash. The key is the delimited fields concatenated together
329      * (including separators). The value is the appendFields concatenated together, as
330      * they will be appended to the input line. Both the keys and append fields are
331      * assembled in the order specified, though this only required for append fields.
332      */
333     string[string] filterHash;
334     string appendFieldsHeader;
335 
336     /* The append values for unmatched records. */
337     char[] appendFieldsUnmatchedValue;
338 
339     if (cmdopt.writeAll)
340     {
341         assert(cmdopt.appendFields.length > 0);  // Checked in consistencyValidations
342 
343         // reserve space for n values and n-1 delimiters
344         appendFieldsUnmatchedValue.reserve(cmdopt.appendFields.length * (cmdopt.writeAllValue.length + 1) - 1);
345 
346         appendFieldsUnmatchedValue ~= cmdopt.writeAllValue;
347         for (size_t i = 1; i < cmdopt.appendFields.length; ++i)
348         {
349             appendFieldsUnmatchedValue ~= cmdopt.delim;
350             appendFieldsUnmatchedValue ~= cmdopt.writeAllValue;
351         }
352     }
353 
354     /* Read the filter file. */
355     {
356         bool needPerFieldProcessing = (numKeyFields > 0) || (numAppendFields > 0);
357         auto filterStream = (cmdopt.filterFile == "-") ? stdin : cmdopt.filterFile.File;
358         foreach (lineNum, line; filterStream.byLine.enumerate(1))
359         {
360             debug writeln("[filter line] |", line, "|");
361             if (needPerFieldProcessing)
362             {
363                 filterKeysReordering.initNewLine;
364                 appendFieldsReordering.initNewLine;
365 
366                 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
367                 {
368                     filterKeysReordering.processNextField(fieldIndex,fieldValue);
369                     appendFieldsReordering.processNextField(fieldIndex,fieldValue);
370 
371                     if (filterKeysReordering.allFieldsFilled && appendFieldsReordering.allFieldsFilled)
372                     {
373                         break;
374                     }
375                 }
376                 // Processed all fields in the line.
377                 if (!filterKeysReordering.allFieldsFilled || !appendFieldsReordering.allFieldsFilled)
378                 {
379                     throw new Exception(
380                         format("Not enough fields in line. File: %s, Line: %s",
381                                (cmdopt.filterFile == "-") ? "Standard Input" : cmdopt.filterFile, lineNum));
382                 }
383             }
384 
385             string key = cmdopt.keyIsFullLine ?
386                 line.to!string : filterKeysReordering.outputFields.join(cmdopt.delim).to!string;
387             string appendValues = cmdopt.appendFullLine ?
388                 line.to!string : appendFieldsReordering.outputFields.join(cmdopt.delim).to!string;
389 
390             debug writeln("  --> [key]:[append] => [", key, "]:[", appendValues, "]");
391 
392             if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, cmdopt.filterFile, lineNum);
393 
394             if (lineNum == 1 && cmdopt.hasHeader)
395             {
396                 if (cmdopt.appendHeaderPrefix.length == 0)
397                 {
398                     appendFieldsHeader = appendValues;
399                 }
400                 else
401                 {
402                     foreach (fieldIndex, fieldValue; appendValues.splitter(cmdopt.delim).enumerate)
403                     {
404                         if (fieldIndex > 0) appendFieldsHeader ~= cmdopt.delim;
405                         appendFieldsHeader ~= cmdopt.appendHeaderPrefix;
406                         appendFieldsHeader ~= fieldValue;
407                     }
408                 }
409             }
410             else
411             {
412                 if (isAppending && !cmdopt.allowDupliateKeys)
413                 {
414                     string* currAppendValues = (key in filterHash);
415                     if (currAppendValues !is null && *currAppendValues != appendValues)
416                     {
417                         throw new Exception(
418                             format("Duplicate keys with different append values (use --z|allow-duplicate-keys to ignore)\n   [key 1][values]: [%s][%s]\n   [key 2][values]: [%s][%s]",
419                                    key, *currAppendValues, key, appendValues));
420                     }
421                 }
422                 filterHash[key] = appendValues;
423             }
424         }
425     }
426 
427     filterHash.rehash;    // For faster lookups. (Per docs. In my tests no performance delta.)
428 
429     /* Now process each input file, one line at a time. */
430 
431     auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
432     bool headerWritten = false;
433 
434     foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
435     {
436         auto inputStream = (filename == "-") ? stdin : filename.File();
437         foreach (lineNum, line; inputStream.byLine.enumerate(1))
438         {
439             debug writeln("[input line] |", line, "|");
440 
441             if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);
442 
443             if (lineNum == 1 && cmdopt.hasHeader)
444             {
445                 /* Header line processing. */
446                 if (!headerWritten)
447                 {
448                     bufferedOutput.append(line);
449                     if (isAppending)
450                     {
451                         bufferedOutput.append(cmdopt.delim);
452                         bufferedOutput.append(appendFieldsHeader);
453                     }
454                     bufferedOutput.appendln;
455                     headerWritten = true;
456                 }
457             }
458             else
459             {
460                 /* Regular line (not a header line).
461                  *
462                  * Next block checks if the input line matches a hash entry. Two cases:
463                  *   a) The whole line is the key. Simply look it up in the hash.
464                  *   b) Individual fields are used as the key - Assemble key and look it up.
465                  *
466                  * At the end of the appendFields will contain the result of hash lookup.
467                  */
468                 string* appendFields;
469                 if (cmdopt.keyIsFullLine)
470                 {
471                     appendFields = (line in filterHash);
472                 }
473                 else
474                 {
475                     dataKeysReordering.initNewLine;
476                     foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
477                     {
478                         dataKeysReordering.processNextField(fieldIndex, fieldValue);
479                         if (dataKeysReordering.allFieldsFilled) break;
480                     }
481                     // Processed all fields in the line.
482                     if (!dataKeysReordering.allFieldsFilled)
483                     {
484                         throw new Exception(
485                             format("Not enough fields in line. File: %s, Line: %s",
486                                    (filename == "-") ? "Standard Input" : filename, lineNum));
487                     }
488                     appendFields = (dataKeysReordering.outputFields.join(cmdopt.delim) in filterHash);
489                 }
490 
491                 bool matched = (appendFields !is null);
492                 debug writeln("   --> matched? ", matched);
493                 if (cmdopt.writeAll || (matched && !cmdopt.exclude) || (!matched && cmdopt.exclude))
494                 {
495                     bufferedOutput.append(line);
496                     if (isAppending)
497                     {
498                         bufferedOutput.append(cmdopt.delim);
499                         bufferedOutput.append(matched ? *appendFields : appendFieldsUnmatchedValue);
500                     }
501                     bufferedOutput.appendln();
502                 }
503             }
504         }
505     }
506 }