tsv_utils.tsv_join source code

1 /**
2 Command line tool that joins tab-separated value files based on a common key.
3 
4 This tool joins lines from tab-delimited files based on a common key. One file, the 'filter'
5 file, contains the records (lines) being matched. The other input files are searched for
6 matching records. Matching records are written to standard output, along with any designated
7 fields from the 'filter' file. In database parlance this is a 'hash semi-join'.
8 
9 Copyright (c) 2015-2019, eBay Software Foundation
10 Initially written by Jon Degenhardt
11 
12 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
13 */
14 module tsv_utils.tsv_join;
15 
16 import std.stdio;
17 import std.format : format;
18 import std.typecons : tuple;
19 
/** Brief usage text. processArgs passes this to defaultGetoptPrinter, together with
 * the getopt option list, when help is requested.
 */
string helpText = q"EOS
Synopsis: tsv-join --filter-file file [options] file [file...]

tsv-join matches input lines against lines from a 'filter' file. The match is
based on fields or the entire line. Use '--help-verbose' for more details.

Options:
EOS";
28 
/** Detailed usage text. processArgs passes this to defaultGetoptPrinter, together
 * with the getopt option list, when --help-verbose is given.
 *
 * The text consistently calls the file supplying match keys and append fields the
 * 'filter file'; there is no separate 'key file'.
 */
string helpTextVerbose = q"EOS
Synopsis: tsv-join --filter-file file [options] file [file...]

tsv-join matches input lines against lines from a 'filter' file. The match is
based on exact match comparison of one or more 'key' fields. Fields are TAB
delimited by default. Matching lines are written to standard output, along with
any additional fields from the filter file that have been specified. An example:

  tsv-join --filter-file filter.tsv --key-fields 1 --append-fields 5,6 data.tsv

This reads filter.tsv, creating a hash table keyed on field 1. Lines from data.tsv
are read one at a time. If field 1 is found in the hash table, the line is written
to standard output with fields 5 and 6 from the filter file appended. In database
parlance this is a "hash semi join". Note the asymmetric relationship: Records in
the filter file should be unique, but data.tsv lines can repeat.

tsv-join can also work as a simple filter, this is the default behavior. Example:

  tsv-join --filter-file filter.tsv data.tsv

This outputs all lines from data.tsv found in filter.tsv. --key-fields can still
be used to define the match key. The --exclude option can be used to exclude
matched lines rather than keep them.

Multiple fields can be specified as keys and append fields. Field numbers start
at one, zero represents the whole line. Fields are comma separated and ranges
can be used. Example:

  tsv-join -f filter.tsv -k 1,2 --append-fields 3-7 data.tsv

Options:
EOS";
61 
/** Container for command line options.
 *
 * The fields are populated by processArgs. Fields commented as "Derived" are not set
 * directly from an option; they are computed by derivations() after the command line
 * has been parsed and validated.
 */
struct TsvJoinOptions
{
    string programName;
    string filterFile;               // --f|filter-file
    size_t[] keyFields;              // --k|key-fields
    size_t[] dataFields;             // --d|data-fields
    size_t[] appendFields;           // --a|append-fields
    bool hasHeader = false;          // --H|header
    string appendHeaderPrefix = "";  // --p|prefix
    bool writeAll = false;           // --w|write-all
    string writeAllValue;            // --w|write-all
    bool exclude = false;            // --e|exclude
    char delim = '\t';               // --delimiter
    bool helpVerbose = false;        // --help-verbose
    bool versionWanted = false;      // --V|version
    bool allowDupliateKeys = false;  // --z|allow-duplicate-keys
    bool keyIsFullLine = false;      // Derived: --key-fields 0, or no --key-fields given
    bool dataIsFullLine = false;     // Derived: set together with keyIsFullLine
    bool appendFullLine = false;     // Derived: --append-fields 0

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     * If the whole line is the key, the individual fields lists will be cleared.
     *
     * Any Exception raised while parsing, validating, or deriving is caught here,
     * reported on stderr, and turned into the (false, 1) result.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils :  makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        /* Handler for --write-all. Special handler so two values can be set. */
        void writeAllHandler(string option, string value)
        {
            debug stderr.writeln("[writeAllHandler] |", option, "|  |", value, "|");
            writeAll = true;
            writeAllValue = value;
        }

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "              Print full help.", &helpVerbose,
                "f|filter-file",   "FILE          (Required) File with records to use as a filter.", &filterFile,

                "k|key-fields",    "<field-list>  Fields to use as join key. Default: 0 (entire line).",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "d|data-fields",   "<field-list>  Data record fields to use as join key, if different than --key-fields.",
                dataFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "a|append-fields", "<field-list>  Filter fields to append to matched records.",
                appendFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                std.getopt.config.caseSensitive,
                "H|header",        "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "p|prefix",        "STR           String to use as a prefix for --append-fields when writing a header line.", &appendHeaderPrefix,
                "w|write-all",     "STR           Output all data records. STR is the --append-fields value when writing unmatched records.", &writeAllHandler,
                "e|exclude",       "              Exclude matching records.", &exclude,
                "delimiter",       "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                "z|allow-duplicate-keys",
                                   "              Allow duplicate keys with different append values (last entry wins).", &allowDupliateKeys,
                std.getopt.config.caseSensitive,
                "V|version",       "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-join"));
                return tuple(false, 0);
            }

            /* Order matters: derivations() relies on invariants established by the
             * validations (it asserts them), and it rewrites the field lists.
             */
            consistencyValidations(cmdArgs);
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* This routine does validations not handled by getopt, usually because they
     * involve interactions between multiple parameters.
     *
     * It runs before derivations(), so field numbers are still one-based here and
     * zero still denotes the whole line.
     *
     * Params:
     *   processedCmdArgs = The command line after getopt has processed it. A length
     *                      of one is treated as "no data files were given".
     *
     * Throws: Exception describing the first inconsistency found.
     */
    private void consistencyValidations(ref string[] processedCmdArgs)
    {
        import std.algorithm : any;
        import std.exception : enforce;

        enforce(filterFile.length > 0,
                "Required option --filter-file was not supplied.");

        enforce(filterFile != "-" || processedCmdArgs.length != 1,
                "A data file is required when standard input is used for the filter file (--f|filter-file -).");

        enforce(!writeAll || appendFields.length > 0,
                "Use --a|append-fields when using --w|write-all.");

        enforce(!(writeAll && appendFields.length == 1 && appendFields[0] == 0),
                "Cannot use '--a|append-fields 0' (whole line) when using --w|write-all.");

        enforce(!(appendFields.length > 0 && exclude),
                "--e|exclude cannot be used with --a|append-fields.");

        enforce(appendHeaderPrefix.length == 0 || hasHeader,
                "Use --header when using --p|prefix.");

        enforce(dataFields.length == 0 || keyFields.length == dataFields.length,
                "Different number of --k|key-fields and --d|data-fields.");

        /* With a single key field and a single data field, either both are the whole
         * line (zero) or neither is.
         */
        enforce(!(keyFields.length == 1 && dataFields.length == 1 &&
                  ((keyFields[0] == 0) != (dataFields[0] == 0))),
                "If either --k|key-fields or --d|data-fields is zero both must be zero.");

        enforce(!((keyFields.length > 1    && any!(a => a == 0)(keyFields)) ||
                  (dataFields.length > 1   && any!(a => a == 0)(dataFields)) ||
                  (appendFields.length > 1 && any!(a => a == 0)(appendFields))),
                "Field 0 (whole line) cannot be combined with individual fields (non-zero).");
    }

    /* Post-processing derivations.
     *
     * Turns 'whole line' selections (a field list that is empty for keys, or that
     * holds the single value zero) into the boolean flags keyIsFullLine,
     * dataIsFullLine and appendFullLine, removing the zero entry from the list. The
     * field numbers that remain are then converted from one-based to zero-based.
     *
     * Expects consistencyValidations to have run first; the asserts below restate
     * conditions it enforces.
     */
    void derivations()
    {
        import std.algorithm : each;
        import std.range;

        // Convert 'full-line' field indexes (index zero) to boolean flags.
        if (keyFields.length == 0)
        {
            // No --key-fields: the default key is the entire line.
            assert(dataFields.length == 0);
            keyIsFullLine = true;
            dataIsFullLine = true;
        }
        else if (keyFields.length == 1 && keyFields[0] == 0)
        {
            keyIsFullLine = true;
            keyFields.popFront;
            dataIsFullLine = true;

            if (dataFields.length == 1)
            {
                assert(dataFields[0] == 0);
                dataFields.popFront;
            }
        }

        if (appendFields.length == 1 && appendFields[0] == 0)
        {
            appendFullLine = true;
            appendFields.popFront;
        }

        assert(!(keyIsFullLine && keyFields.length > 0));
        assert(!(dataIsFullLine && dataFields.length > 0));
        assert(!(appendFullLine && appendFields.length > 0));

        // Switch to zero-based field indexes. Any zero entries were removed above, so
        // the decrement cannot wrap.
        keyFields.each!((ref a) => --a);
        dataFields.each!((ref a) => --a);
        appendFields.each!((ref a) => --a);
    }
}
267 
/* Runtime options embedded in the executable. Only declared for front-end versions
 * 2.085 and later.
 * NOTE(review): "gcopt=cleanup:none" presumably tells the D runtime to skip garbage
 * collector cleanup at program exit - confirm against the druntime GC configuration docs.
 */
static if (__VERSION__ >= 2085)
{
    extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
}
269 
/** Program entry point.
 *
 * Parses the command line into a TsvJoinOptions and, if processing should continue,
 * runs tsvJoin on the arguments that remain after the program name.
 *
 * Returns: The exit code supplied by processArgs when it signals that execution
 * should stop; 1 if tsvJoin throws an Exception; 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* Code coverage builds with DMD: merge coverage reports rather than replace them. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvJoinOptions options;
    auto argsStatus = options.processArgs(cmdArgs);

    /* argsStatus[0]: continue running? argsStatus[1]: exit code to use when not. */
    if (!argsStatus[0])
    {
        return argsStatus[1];
    }

    try
    {
        /* cmdArgs[0] is the program name; the rest is passed on as the input files. */
        tsvJoin(options, cmdArgs[1 .. $]);
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", options.programName, exc.msg);
        return 1;
    }

    return 0;
}
292 
/** tsvJoin does the primary work of the tsv-join program.
 *
 * The filter file is read first and loaded into an associative array that maps each
 * key to the text to append for that key. Each input file is then read line by line;
 * a line's key is looked up in the array and the line is written to standard output
 * (or not) according to the match result and the --exclude / --write-all settings.
 *
 * Params:
 *   cmdopt     = Processed command line options. Field lists are zero-based.
 *   inputFiles = Data files to read. "-" means standard input, which is also used
 *                when the list is empty.
 *
 * Throws: Exception when a line has too few fields for the requested key or append
 * fields, or when the filter file has duplicate keys with different append values
 * and --allow-duplicate-keys was not given. Helpers called here may throw as well.
 */
void tsvJoin(in TsvJoinOptions cmdopt, in string[] inputFiles)
{
    import tsv_utils.common.utils : InputFieldReordering, bufferedByLine, BufferedOutputRange, throwIfWindowsNewlineOnUnix;
    import std.algorithm : splitter;
    import std.array : join;
    import std.range;
    import std.conv : to;

    /* Guide to the state used below.
     *
     * Allowing both individual fields and the whole line (field zero) is convenient for
     * the user but leaves several cases to handle, even after command line validation
     * has rejected many combinations. The code also tries to do only the work a given
     * case needs.
     * - cmdopt.keyFields, cmdopt.dataFields - Individual field indexes used as keys.
     *      Empty if the whole line is used as the key. Must be the same length.
     * - cmdopt.keyIsFullLine, cmdopt.dataIsFullLine - True when the whole line is the key.
     * - cmdopt.appendFields - Indexes of individual filter file fields being appended.
     *      Empty if appending the full line, or if not appending anything.
     * - cmdopt.appendFullLine - True when the whole filter line is being appended.
     * - isAppending - True if something is being appended.
     * - cmdopt.writeAll - True if all data lines are being written.
     */
    auto numKeyFields = cmdopt.keyFields.length;
    auto numAppendFields = cmdopt.appendFields.length;
    immutable bool isAppending = cmdopt.appendFullLine || (numAppendFields > 0);

    /* Field collectors for the filter keys, data keys, and append fields. The data
     * side shares the filter key collector unless separate data fields were given.
     */
    auto filterKeysReordering = new InputFieldReordering!char(cmdopt.keyFields);
    auto dataKeysReordering = (cmdopt.dataFields.length == 0) ?
        filterKeysReordering : new InputFieldReordering!char(cmdopt.dataFields);
    auto appendFieldsReordering = new InputFieldReordering!char(cmdopt.appendFields);

    /* The filter table. Keys are the key fields joined with the delimiter (or the
     * whole line). Values are the append fields joined with the delimiter, exactly as
     * they are written after a matching data line.
     */
    string[string] filterHash;
    string appendFieldsHeader;

    /* Text appended to unmatched records under --write-all: the write-all value
     * repeated once per append field, separated by the delimiter.
     */
    char[] unmatchedAppendValue;

    if (cmdopt.writeAll)
    {
        assert(cmdopt.appendFields.length > 0);  // Checked in consistencyValidations

        immutable size_t fillerCount = cmdopt.appendFields.length;

        // Room for fillerCount values plus (fillerCount - 1) delimiters.
        unmatchedAppendValue.reserve(fillerCount * (cmdopt.writeAllValue.length + 1) - 1);

        foreach (i; 0 .. fillerCount)
        {
            if (i > 0) unmatchedAppendValue ~= cmdopt.delim;
            unmatchedAppendValue ~= cmdopt.writeAllValue;
        }
    }

    /* Pass 1: load the filter file. Kept in its own scope, as in the original layout. */
    {
        immutable bool needPerFieldProcessing = (numKeyFields > 0) || (numAppendFields > 0);
        auto filterStream = (cmdopt.filterFile == "-") ? stdin : cmdopt.filterFile.File;

        foreach (lineNum, line; filterStream.bufferedByLine.enumerate(1))
        {
            debug writeln("[filter line] |", line, "|");

            if (needPerFieldProcessing)
            {
                filterKeysReordering.initNewLine;
                appendFieldsReordering.initNewLine;

                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    filterKeysReordering.processNextField(fieldIndex, fieldValue);
                    appendFieldsReordering.processNextField(fieldIndex, fieldValue);

                    // Stop splitting once both collectors have everything they need.
                    if (filterKeysReordering.allFieldsFilled && appendFieldsReordering.allFieldsFilled)
                    {
                        break;
                    }
                }

                // The line ran out before every requested field was seen.
                if (!(filterKeysReordering.allFieldsFilled && appendFieldsReordering.allFieldsFilled))
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (cmdopt.filterFile == "-") ? "Standard Input" : cmdopt.filterFile, lineNum));
                }
            }

            string key;
            if (cmdopt.keyIsFullLine)
            {
                key = line.to!string;
            }
            else
            {
                key = filterKeysReordering.outputFields.join(cmdopt.delim).to!string;
            }

            string appendValues;
            if (cmdopt.appendFullLine)
            {
                appendValues = line.to!string;
            }
            else
            {
                appendValues = appendFieldsReordering.outputFields.join(cmdopt.delim).to!string;
            }

            debug writeln("  --> [key]:[append] => [", key, "]:[", appendValues, "]");

            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, cmdopt.filterFile, lineNum);

            if (lineNum == 1 && cmdopt.hasHeader)
            {
                /* Header line: remember the append field names; it is not a filter entry. */
                if (cmdopt.appendHeaderPrefix.length == 0)
                {
                    appendFieldsHeader = appendValues;
                }
                else
                {
                    // Put the prefix in front of each appended header field.
                    foreach (headerIndex, headerName; appendValues.splitter(cmdopt.delim).enumerate)
                    {
                        if (headerIndex > 0) appendFieldsHeader ~= cmdopt.delim;
                        appendFieldsHeader ~= cmdopt.appendHeaderPrefix;
                        appendFieldsHeader ~= headerName;
                    }
                }
                continue;
            }

            /* A repeated key is an error only when appending and its append values differ. */
            if (isAppending && !cmdopt.allowDupliateKeys)
            {
                if (auto existingValues = key in filterHash)
                {
                    if (*existingValues != appendValues)
                    {
                        throw new Exception(
                            format("Duplicate keys with different append values (use --z|allow-duplicate-keys to ignore)\n   [key 1][values]: [%s][%s]\n   [key 2][values]: [%s][%s]",
                                   key, *existingValues, key, appendValues));
                    }
                }
            }

            filterHash[key] = appendValues;  // A later entry replaces an earlier one.
        }
    }

    filterHash.rehash;    // For faster lookups. (Per docs. In my tests no performance delta.)

    /* Pass 2: process each input file, one line at a time. */

    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
    bool headerWritten = false;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();

        foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
        {
            debug writeln("[input line] |", line, "|");

            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            if (lineNum == 1 && cmdopt.hasHeader)
            {
                /* Header line. Only the first header encountered is written. */
                if (!headerWritten)
                {
                    bufferedOutput.append(line);
                    if (isAppending)
                    {
                        bufferedOutput.append(cmdopt.delim);
                        bufferedOutput.append(appendFieldsHeader);
                    }
                    bufferedOutput.appendln;
                    headerWritten = true;
                }
                continue;
            }

            /* Regular line. Look up its key in the filter table:
             *   a) The whole line is the key - look the line up directly.
             *   b) Individual fields form the key - assemble the key, then look it up.
             *
             * matchedAppendValues is null when there is no match; otherwise it points
             * at the append text stored for the key.
             */
            string* matchedAppendValues;
            if (cmdopt.keyIsFullLine)
            {
                matchedAppendValues = (line in filterHash);
            }
            else
            {
                dataKeysReordering.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    dataKeysReordering.processNextField(fieldIndex, fieldValue);
                    if (dataKeysReordering.allFieldsFilled) break;
                }

                // The line ran out before every key field was seen.
                if (!dataKeysReordering.allFieldsFilled)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (filename == "-") ? "Standard Input" : filename, lineNum));
                }
                matchedAppendValues = (dataKeysReordering.outputFields.join(cmdopt.delim) in filterHash);
            }

            immutable bool matched = (matchedAppendValues !is null);
            debug writeln("   --> matched? ", matched);

            /* Write everything under --write-all. Otherwise write matches, or the
             * non-matches when --exclude is in effect.
             */
            immutable bool writeLine = cmdopt.writeAll || (matched != cmdopt.exclude);
            if (!writeLine) continue;

            bufferedOutput.append(line);
            if (isAppending)
            {
                bufferedOutput.append(cmdopt.delim);
                bufferedOutput.append(matched ? *matchedAppendValues : unmatchedAppendValue);
            }
            bufferedOutput.appendln();
        }
    }
}