tsv_utils.tsv_uniq source code

1 /**
2 Command line tool that identifies equivalent lines in an input stream. Equivalent
3 lines are identified using either the full line or a set of fields as the key. By
4 default, input is written to standard output, retaining only the first occurrence of
5 equivalent lines. There are also options for marking and numbering equivalent lines
6 rather, without filtering out duplicates.
7 
8 This tool is similar in spirit to the Unix 'uniq' tool, with some key differences.
9 First, the key can be composed of individual fields, not just the full line. Second,
10 input does not need to be sorted. (Unix 'uniq' only detects equivalent lines when
11 they are adjacent, hence the usual need for sorting.)
12 
13 There are a couple alternative to uniq'ing the input lines. One is to mark lines with
14 an equivalence ID, which is a one-upped counter. The other is to number lines, with
15 each unique key have its own set of numbers.
16 
17 Copyright (c) 2015-2020, eBay Inc.
18 Initially written by Jon Degenhardt
19 
20 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
21 */
22 module tsv_utils.tsv_uniq;
23 
24 import std.exception : enforce;
25 import std.format : format;
26 import std.range;
27 import std.stdio;
28 import std.typecons : tuple;
29 
30 auto helpText = q"EOS
31 Synopsis: tsv-uniq [options] [file...]
32 
33 tsv-uniq filters out duplicate lines using fields as a key. Filtering is
34 based on the entire line when key fields are not provided. Options are
35 also available for assigning a unique id to each key and numbering the
36 occurrences of each key. Fields are specified using field number or field
37 name. Field names can be used when the input file has a header line.
38 
39 Use '--help-verbose' for more details.
40 Options:
41 EOS";
42 
43 auto helpTextVerbose = q"EOS
44 Synopsis: tsv-uniq [options] [file...]
45 
46 tsv-uniq identifies equivalent lines in tab-separated value files. Input
47 is read line by line, recording a key for each line based on one or more
48 of the fields. Two lines are equivalent if they have the same key. The
49 first time a key is seen its line is written to standard output.
50 Subsequent lines containing the same key are discarded. This command
51 uniq's a file on fields 2 and 3:
52 
53    tsv-uniq -f 2,3 file.tsv
54 
55 This is similar to the Unix 'uniq' program, but based on individual
56 fields and without requiring sorted data.
57 
58 Field names can be used if the input file has a header line. This command
59 uniq's a file based on the 'time' and 'date' fields:
60 
61    tsv-uniq -H -f time,date file.tsv
62 
63 Use '--help-fields' for details about field names.
64 
65 tsv-uniq can be run without specifying a key field. In this case the
66 whole line is used as a key, same as the Unix 'uniq' program. This works
67 on any line-oriented text file, not just TSV files.
68 
69 The above is the default behavior ('uniq' mode). The alternates to 'uniq'
70 mode are 'number' mode and 'equiv-class' mode. In 'equiv-class' mode, all
71 lines are written to standard output, but with a field appended marking
72 equivalent entries with an ID. The ID is a one-upped counter. Example:
73 
74    tsv-uniq --header -f 2,3 --equiv file.tsv
75 
76 'Number' mode also writes all lines to standard output, but with a field
77 appended numbering the occurrence count for the line's key. The first line
78 with a specific key is assigned the number '1', the second with the key is
79 assigned number '2', etc. 'Number' and 'equiv-class' modes can be combined.
80 
81 The '--r|repeated' option can be used to print only lines occurring more
82 than once. Specifically, the second occurrence of a key is printed. The
83 '--a|at-least N' option is similar, printing lines occurring at least N
84 times. (Like repeated, the Nth line with the key is printed.)
85 
86 The '--m|max MAX' option changes the behavior to output the first MAX
87 lines for each key, rather than just the first line for each key.
88 
89 If both '--a|at-least' and '--m|max' are specified, the occurrences
90 starting with 'at-least' and ending with 'max' are output.
91 
92 Options:
93 EOS";
94 
95 /** Container for command line options.
96  */
97 struct TsvUniqOptions
98 {
99     import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader;
100 
101     enum defaultEquivHeader = "equiv_id";
102     enum defaultEquivStartID = 1;
103     enum defaultNumberHeader = "equiv_line";
104 
105     string programName;
106     InputSourceRange inputSources;            /// Input files
107     size_t[] fields;                          /// Derived: --f|fields
108     bool hasHeader = false;                   /// --H|header
109     bool onlyRepeated = false;                /// --r|repeated. Shorthand for '--atleast 2'
110     size_t atLeast = 0;                       /// --a|at-least. Zero implies default behavior.
111     size_t max = 0;                           /// --m|max. Zero implies default behavior.
112     bool numberMode = false;                  /// --z|number
113     string numberHeader = defaultNumberHeader;  /// --number-header
114     bool equivMode = false;                   /// --e|equiv
115     string equivHeader = defaultEquivHeader;  /// --equiv-header
116     long equivStartID = defaultEquivStartID;  /// --equiv-start
117     bool ignoreCase = false;                  /// --i|ignore-case
118     char delim = '\t';                        /// --d|delimiter
119     bool keyIsFullLine = false;               /// Derived. True if no fields specified or '--f|fields 0'
120 
121     /* Returns a tuple. First value is true if command line arguments were successfully
122      * processed and execution should continue, or false if an error occurred or the user
123      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
124      *
125      * Returning true (execution continues) means args have been validated and derived
126      * values calculated. In addition, field indices have been converted to zero-based.
127      * If the whole line is the key, the individual fields list will be cleared.
128      *
129      * Repeat count control variables 'atLeast' and max' - These values are left at zero
130      * if no repeat count options are specified. They are set if repeat count options
131      * are specified, as follows:
132      *   * atLeast - Will be zero unless --r|repeated or --a|at-least is specified.
133      *     --r|repeated option sets it 2, --a|at-least sets it to the specified value.
134      *   * max - Default to zero. Is set to the --m|max value if provided. Is set to
135      *    'atLeast' if --r|repeated or --a|at-least is provided.
136      *
137      * An exception to the above: If --e|equiv-mode is specified, then (max == 0)
138      * represents the default "output all values" case. In this case max may be less
139      * than the at-least value.
140      */
141     auto processArgs (ref string[] cmdArgs)
142     {
143         import std.algorithm : all, each;
144         import std.conv : to;
145         import std.getopt;
146         import std.path : baseName, stripExtension;
147         import std.typecons : Yes, No;
148         import tsv_utils.common.fieldlist;
149         import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;
150 
151         bool helpVerbose = false;         // --h|help-verbose
152         bool helpFields = false;          // --help-fields
153         bool versionWanted = false;       // --V|version
154         string fieldsArg;                 // --f|fields
155 
156         string fieldsOptionString = "f|fields";
157 
158         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
159 
160         try
161         {
162             arraySep = ",";    // Use comma to separate values in command line options
163             auto r = getopt(
164                 cmdArgs,
165                 "help-verbose",  "              Print full help.", &helpVerbose,
166                 "help-fields",   "              Print help on specifying fields.", &helpFields,
167                 std.getopt.config.caseSensitive,
168                 "V|version",     "              Print version information and exit.", &versionWanted,
169                 "H|header",      "              Treat the first line of each file as a header.", &hasHeader,
170                 std.getopt.config.caseInsensitive,
171 
172                 fieldsOptionString,      "<field-list>  Fields to use as the key. Default: 0 (entire line).", &fieldsArg,
173 
174                 "i|ignore-case", "              Ignore case when comparing keys.", &ignoreCase,
175                 "r|repeated",    "              Output only lines that are repeated (based on the key).", &onlyRepeated,
176                 "a|at-least",    "INT           Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast,
177                 "m|max",         "INT           Max number of each unique key to output (zero is ignored).", &max,
178                 "e|equiv",       "              Output equivalence class IDs rather than uniq'ing entries.", &equivMode,
179                 "equiv-header",  "STR           Use STR as the equiv-id field header (when using '-H --equiv'). Default: 'equiv_id'.", &equivHeader,
180                 "equiv-start",   "INT           Use INT as the first equiv-id. Default: 1.", &equivStartID,
181                 "z|number",      "              Output equivalence class occurrence counts rather than uniq'ing entries.", &numberMode,
182                 "number-header", "STR           Use STR as the '--number' field header (when using '-H --number)'. Default: 'equiv_line'.", &numberHeader,
183                 "d|delimiter",   "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
184             );
185 
186             if (r.helpWanted)
187             {
188                 defaultGetoptPrinter(helpText, r.options);
189                 return tuple(false, 0);
190             }
191             else if (helpVerbose)
192             {
193                 defaultGetoptPrinter(helpTextVerbose, r.options);
194                 return tuple(false, 0);
195             }
196             else if (helpFields)
197             {
198                 writeln(fieldListHelpText);
199                 return tuple(false, 0);
200             }
201             else if (versionWanted)
202             {
203                 import tsv_utils.common.tsvutils_version;
204                 writeln(tsvutilsVersionNotice("tsv-uniq"));
205                 return tuple(false, 0);
206             }
207 
208             /* Input files. Remaining command line args are files. */
209             string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
210             cmdArgs.length = 1;
211 
212             /* Validation - Do as much validation prior to header line processing as
213              * possible (avoids waiting on stdin).
214              */
215             if (!equivMode)
216             {
217                 enforce(equivHeader == defaultEquivHeader, "--equiv-header requires --e|equiv");
218                 enforce(equivStartID == defaultEquivStartID, "--equiv-start requires --e|equiv");
219             }
220 
221             enforce(numberMode || numberHeader == defaultNumberHeader,
222                     "--number-header requires --z|number");
223 
224             string[] headerFields;
225 
226             /* fieldListArgProcessing encapsulates the field list processing. It is
227              * called prior to reading the header line if headers are not being used,
228              * and after if headers are being used.
229              */
230             void fieldListArgProcessing()
231             {
232                 if (!fieldsArg.empty)
233                 {
234                     fields =
235                         fieldsArg
236                         .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
237                         (hasHeader, headerFields, fieldsOptionString)
238                         .array;
239                 }
240 
241                 enforce(fields.length <= 1 || fields.all!(x => x != 0),
242                         "Whole line as key (--f|field 0) cannot be combined with multiple fields.");
243 
244                 if (fields.length == 0)
245                 {
246                     keyIsFullLine = true;
247                 }
248                 else if (fields.length == 1 && fields[0] == 0)
249                 {
250                     keyIsFullLine = true;
251                     fields.length = 0;
252                 }
253 
254                 if (onlyRepeated && atLeast <= 1) atLeast = 2;
255                 if (atLeast >= 2 && max < atLeast)
256                 {
257                     // Don't modify max if it is zero and equivMode or numberMode is in effect.
258                     if (max != 0 || (!equivMode && !numberMode)) max = atLeast;
259                 }
260 
261                 if (!keyIsFullLine) fields.each!((ref x) => --x);  // Convert to 0-based indexing.
262             }
263 
264             if (!hasHeader) fieldListArgProcessing();
265 
266             /*
267              * Create the inputSourceRange and perform header line processing.
268              */
269             ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
270             inputSources = inputSourceRange(filepaths, readHeader);
271 
272             if (hasHeader)
273             {
274                 throwIfWindowsNewlineOnUnix(inputSources.front.header, inputSources.front.name, 1);
275                 headerFields = inputSources.front.header.split(delim).to!(string[]);
276                 fieldListArgProcessing();
277             }
278         }
279         catch (Exception exc)
280         {
281             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
282             return tuple(false, 1);
283         }
284         return tuple(true, 0);
285     }
286 }
287 
288 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
289 
290 /** Main program. Processes command line arguments and calls tsvUniq which implements
291  * the main processing logic.
292  */
293 int main(string[] cmdArgs)
294 {
295     /* When running in DMD code coverage mode, turn on report merging. */
296     version(D_Coverage) version(DigitalMars)
297     {
298         import core.runtime : dmd_coverSetMerge;
299         dmd_coverSetMerge(true);
300     }
301 
302     TsvUniqOptions cmdopt;
303     auto r = cmdopt.processArgs(cmdArgs);
304     if (!r[0]) return r[1];
305 
306     version(LDC_Profile)
307     {
308         import ldc.profile : resetAll;
309         resetAll();
310     }
311 
312     try tsvUniq(cmdopt);
313     catch (Exception exc)
314     {
315         stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
316         return 1;
317     }
318     return 0;
319 }
320 
321 /** Outputs the unique lines from all the input files.
322  *
323  * Processes the lines in each input file. All lines are added to an associated array.
324  * The first time a line is seen it is output. If key fields are being used these are
325  * used as the basis for the associative array entries rather than the full line.
326  */
327 void tsvUniq(ref TsvUniqOptions cmdopt)
328 {
329     import tsv_utils.common.utils : bufferedByLine, BufferedOutputRange,
330         InputFieldReordering, InputSourceRange, joinAppend, throwIfWindowsNewlineOnUnix;
331     import std.algorithm : splitter;
332     import std.array : appender;
333     import std.conv : to;
334     import std.uni : asLowerCase;
335     import std.utf : byChar;
336 
337     /* inputSources must be an InputSourceRange and include at least stdin. */
338     assert(!cmdopt.inputSources.empty);
339     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
340 
341     /* InputFieldReordering maps the key fields from an input line to a separate buffer. */
342     auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields);
343 
344     /* BufferedOutputRange is a performance enhancement for writing to stdout. */
345     auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
346 
347     /* The master hash. The key is the specified fields concatenated together (including
348      * separators). The value is a struct with the equiv-id and occurrence count.
349      */
350     static struct EquivEntry { size_t equivID; size_t count; }
351     EquivEntry[string] equivHash;
352 
353     /* Reusable buffers for multi-field keys and case-insensitive keys. */
354     auto multiFieldKeyBuffer = appender!(char[]);
355     auto lowerKeyBuffer = appender!(char[]);
356 
357     const size_t numKeyFields = cmdopt.fields.length;
358     long nextEquivID = cmdopt.equivStartID;
359 
360     /* First header is read during command line arg processing. Flush it immediately
361      * so subsequent processes in a unix command pipeline see it early. This helps
362      * provide timely error messages.
363      */
364     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
365     {
366         auto inputStream = cmdopt.inputSources.front;
367 
368         bufferedOutput.append(inputStream.header);
369 
370         if (cmdopt.equivMode)
371         {
372             bufferedOutput.append(cmdopt.delim);
373             bufferedOutput.append(cmdopt.equivHeader);
374         }
375 
376         if (cmdopt.numberMode)
377         {
378             bufferedOutput.append(cmdopt.delim);
379             bufferedOutput.append(cmdopt.numberHeader);
380         }
381 
382         bufferedOutput.appendln();
383         bufferedOutput.flush();
384     }
385 
386     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
387 
388     foreach (inputStream; cmdopt.inputSources)
389     {
390         if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
391 
392         foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine))
393         {
394             if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum);
395 
396             /* Start by finding the key. */
397             typeof(line) key;
398             if (cmdopt.keyIsFullLine)
399             {
400                 key = line;
401             }
402             else
403             {
404                 assert(keyFieldsReordering !is null);
405 
406                 /* Copy the key fields to a new buffer. */
407                 keyFieldsReordering.initNewLine;
408                 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
409                 {
410                     keyFieldsReordering.processNextField(fieldIndex, fieldValue);
411                     if (keyFieldsReordering.allFieldsFilled) break;
412                 }
413 
414                 enforce(keyFieldsReordering.allFieldsFilled,
415                         format("Not enough fields in line. File: %s, Line: %s",
416                                inputStream.name, lineNum));
417 
418                 if (numKeyFields == 1)
419                 {
420                     key = keyFieldsReordering.outputFields[0];
421                 }
422                 else
423                 {
424                     multiFieldKeyBuffer.clear();
425                     keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim);
426                     key = multiFieldKeyBuffer.data;
427                 }
428             }
429 
430             if (cmdopt.ignoreCase)
431             {
432                 /* Equivalent to key = key.toLower, but without memory allocation. */
433                 lowerKeyBuffer.clear();
434                 lowerKeyBuffer.put(key.asLowerCase.byChar);
435                 key = lowerKeyBuffer.data;
436             }
437 
438             bool isOutput = false;
439             EquivEntry currEntry;
440             EquivEntry* priorEntry = (key in equivHash);
441             if (priorEntry is null)
442             {
443                 isOutput = (cmdopt.atLeast <= 1);
444                 currEntry.equivID = nextEquivID;
445                 currEntry.count = 1;
446                 equivHash[key.to!string] = currEntry;
447                 nextEquivID++;
448             }
449             else
450             {
451                 (*priorEntry).count++;
452                 currEntry = *priorEntry;
453 
454                 if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) ||
455                     (cmdopt.equivMode && cmdopt.max == 0) ||
456                     (cmdopt.numberMode && cmdopt.max == 0))
457                 {
458                     isOutput = true;
459                 }
460             }
461 
462             if (isOutput)
463             {
464                 bufferedOutput.append(line);
465 
466                 if (cmdopt.equivMode)
467                 {
468                     bufferedOutput.append(cmdopt.delim);
469                     bufferedOutput.append(currEntry.equivID.to!string);
470                 }
471 
472                 if (cmdopt.numberMode)
473                 {
474                     bufferedOutput.append(cmdopt.delim);
475                     bufferedOutput.append(currEntry.count.to!string);
476                 }
477 
478                 bufferedOutput.appendln();
479             }
480         }
481     }
482 }