1 /**
2 Command line tool that identifies equivalent lines in an input stream. Equivalent
3 lines are identified using either the full line or a set of fields as the key. By
4 default, input is written to standard output, retaining only the first occurrence of
5 equivalent lines. There are also options for marking and numbering equivalent lines
6 rather, without filtering out duplicates.
7 
8 This tool is similar in spirit to the Unix 'uniq' tool, with some key differences.
9 First, the key can be composed of individual fields, not just the full line. Second,
10 input does not need to be sorted. (Unix 'uniq' only detects equivalent lines when
11 they are adjacent, hence the usual need for sorting.)
12 
13 There are a couple alternative to uniq'ing the input lines. One is to mark lines with
14 an equivalence ID, which is a one-upped counter. The other is to number lines, with
15 each unique key have its own set of numbers.
16 
17 Copyright (c) 2015-2021, eBay Inc.
18 Initially written by Jon Degenhardt
19 
20 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
21 */
22 module tsv_utils.tsv_uniq;
23 
24 import std.exception : enforce;
25 import std.format : format;
26 import std.range;
27 import std.stdio;
28 import std.typecons : tuple;
29 
30 auto helpText = q"EOS
31 Synopsis: tsv-uniq [options] [file...]
32 
33 tsv-uniq filters out duplicate lines using fields as a key. Filtering is
34 based on the entire line when key fields are not provided. Options are
35 also available for assigning a unique id to each key and numbering the
36 occurrences of each key. Fields are specified using field number or field
37 name. Field names can be used when the input file has a header line.
38 
39 Use '--help-verbose' for more details.
40 Options:
41 EOS";
42 
43 auto helpTextVerbose = q"EOS
44 Synopsis: tsv-uniq [options] [file...]
45 
46 tsv-uniq identifies equivalent lines in tab-separated value files. Input
47 is read line by line, recording a key for each line based on one or more
48 of the fields. Two lines are equivalent if they have the same key. The
49 first time a key is seen its line is written to standard output.
50 Subsequent lines containing the same key are discarded. This command
51 uniq's a file on fields 2 and 3:
52 
53    tsv-uniq -f 2,3 file.tsv
54 
55 This is similar to the Unix 'uniq' program, but based on individual
56 fields and without requiring sorted data.
57 
58 Field names can be used if the input file has a header line. This command
59 uniq's a file based on the 'time' and 'date' fields:
60 
61    tsv-uniq -H -f time,date file.tsv
62 
63 Use '--help-fields' for details about field names.
64 
65 tsv-uniq can be run without specifying a key field. In this case the
66 whole line is used as a key, same as the Unix 'uniq' program. This works
67 on any line-oriented text file, not just TSV files.
68 
69 The above is the default behavior ('uniq' mode). The alternates to 'uniq'
70 mode are 'number' mode and 'equiv-class' mode. In 'equiv-class' mode, all
71 lines are written to standard output, but with a field appended marking
72 equivalent entries with an ID. The ID is a one-upped counter. Example:
73 
74    tsv-uniq --header -f 2,3 --equiv file.tsv
75 
76 'Number' mode also writes all lines to standard output, but with a field
77 appended numbering the occurrence count for the line's key. The first line
78 with a specific key is assigned the number '1', the second with the key is
79 assigned number '2', etc. 'Number' and 'equiv-class' modes can be combined.
80 
81 The '--r|repeated' option can be used to print only lines occurring more
82 than once. Specifically, the second occurrence of a key is printed. The
83 '--a|at-least N' option is similar, printing lines occurring at least N
84 times. (Like repeated, the Nth line with the key is printed.)
85 
86 The '--m|max MAX' option changes the behavior to output the first MAX
87 lines for each key, rather than just the first line for each key.
88 
89 If both '--a|at-least' and '--m|max' are specified, the occurrences
90 starting with 'at-least' and ending with 'max' are output.
91 
92 Options:
93 EOS";
94 
95 /** Container for command line options.
96  */
97 struct TsvUniqOptions
98 {
99     import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader;
100 
101     enum defaultEquivHeader = "equiv_id";
102     enum defaultEquivStartID = 1;
103     enum defaultNumberHeader = "equiv_line";
104 
105     string programName;
106     InputSourceRange inputSources;            /// Input files
107     size_t[] fields;                          /// Derived: --f|fields
108     bool hasHeader = false;                   /// --H|header
109     bool onlyRepeated = false;                /// --r|repeated. Shorthand for '--atleast 2'
110     size_t atLeast = 0;                       /// --a|at-least. Zero implies default behavior.
111     size_t max = 0;                           /// --m|max. Zero implies default behavior.
112     bool numberMode = false;                  /// --z|number
113     string numberHeader = defaultNumberHeader;  /// --number-header
114     bool equivMode = false;                   /// --e|equiv
115     string equivHeader = defaultEquivHeader;  /// --equiv-header
116     long equivStartID = defaultEquivStartID;  /// --equiv-start
117     bool ignoreCase = false;                  /// --i|ignore-case
118     char delim = '\t';                        /// --d|delimiter
119     bool lineBuffered = false;                /// --line-buffered
120     bool keyIsFullLine = false;               /// Derived. True if no fields specified or '--f|fields 0'
121 
122     /* Returns a tuple. First value is true if command line arguments were successfully
123      * processed and execution should continue, or false if an error occurred or the user
124      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
125      *
126      * Returning true (execution continues) means args have been validated and derived
127      * values calculated. In addition, field indices have been converted to zero-based.
128      * If the whole line is the key, the individual fields list will be cleared.
129      *
130      * Repeat count control variables 'atLeast' and max' - These values are left at zero
131      * if no repeat count options are specified. They are set if repeat count options
132      * are specified, as follows:
133      *   * atLeast - Will be zero unless --r|repeated or --a|at-least is specified.
134      *     --r|repeated option sets it 2, --a|at-least sets it to the specified value.
135      *   * max - Default to zero. Is set to the --m|max value if provided. Is set to
136      *    'atLeast' if --r|repeated or --a|at-least is provided.
137      *
138      * An exception to the above: If --e|equiv-mode is specified, then (max == 0)
139      * represents the default "output all values" case. In this case max may be less
140      * than the at-least value.
141      */
142     auto processArgs (ref string[] cmdArgs)
143     {
144         import std.algorithm : all, each;
145         import std.conv : to;
146         import std.getopt;
147         import std.path : baseName, stripExtension;
148         import std.typecons : Yes, No;
149         import tsv_utils.common.fieldlist;
150         import tsv_utils.common.utils : throwIfWindowsNewline;
151 
152         bool helpVerbose = false;         // --h|help-verbose
153         bool helpFields = false;          // --help-fields
154         bool versionWanted = false;       // --V|version
155         string fieldsArg;                 // --f|fields
156 
157         string fieldsOptionString = "f|fields";
158 
159         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
160 
161         try
162         {
163             arraySep = ",";    // Use comma to separate values in command line options
164             auto r = getopt(
165                 cmdArgs,
166                 "help-verbose",  "              Print full help.", &helpVerbose,
167                 "help-fields",   "              Print help on specifying fields.", &helpFields,
168                 std.getopt.config.caseSensitive,
169                 "V|version",     "              Print version information and exit.", &versionWanted,
170                 "H|header",      "              Treat the first line of each file as a header.", &hasHeader,
171                 std.getopt.config.caseInsensitive,
172 
173                 fieldsOptionString,      "<field-list>  Fields to use as the key. Default: 0 (entire line).", &fieldsArg,
174 
175                 "i|ignore-case", "              Ignore case when comparing keys.", &ignoreCase,
176                 "r|repeated",    "              Output only lines that are repeated (based on the key).", &onlyRepeated,
177                 "a|at-least",    "INT           Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast,
178                 "m|max",         "INT           Max number of each unique key to output (zero is ignored).", &max,
179                 "e|equiv",       "              Output equivalence class IDs rather than uniq'ing entries.", &equivMode,
180                 "equiv-header",  "STR           Use STR as the equiv-id field header (when using '-H --equiv'). Default: 'equiv_id'.", &equivHeader,
181                 "equiv-start",   "INT           Use INT as the first equiv-id. Default: 1.", &equivStartID,
182                 "z|number",      "              Output equivalence class occurrence counts rather than uniq'ing entries.", &numberMode,
183                 "number-header", "STR           Use STR as the '--number' field header (when using '-H --number)'. Default: 'equiv_line'.", &numberHeader,
184                 "d|delimiter",   "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
185                 "line-buffered", "              Immediately output every line.", &lineBuffered,
186             );
187 
188             if (r.helpWanted)
189             {
190                 defaultGetoptPrinter(helpText, r.options);
191                 return tuple(false, 0);
192             }
193             else if (helpVerbose)
194             {
195                 defaultGetoptPrinter(helpTextVerbose, r.options);
196                 return tuple(false, 0);
197             }
198             else if (helpFields)
199             {
200                 writeln(fieldListHelpText);
201                 return tuple(false, 0);
202             }
203             else if (versionWanted)
204             {
205                 import tsv_utils.common.tsvutils_version;
206                 writeln(tsvutilsVersionNotice("tsv-uniq"));
207                 return tuple(false, 0);
208             }
209 
210             /* Input files. Remaining command line args are files. */
211             string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
212             cmdArgs.length = 1;
213 
214             /* Validation - Do as much validation prior to header line processing as
215              * possible (avoids waiting on stdin).
216              */
217             if (!equivMode)
218             {
219                 enforce(equivHeader == defaultEquivHeader, "--equiv-header requires --e|equiv");
220                 enforce(equivStartID == defaultEquivStartID, "--equiv-start requires --e|equiv");
221             }
222 
223             enforce(numberMode || numberHeader == defaultNumberHeader,
224                     "--number-header requires --z|number");
225 
226             string[] headerFields;
227 
228             /* fieldListArgProcessing encapsulates the field list processing. It is
229              * called prior to reading the header line if headers are not being used,
230              * and after if headers are being used.
231              */
232             void fieldListArgProcessing()
233             {
234                 if (!fieldsArg.empty)
235                 {
236                     fields =
237                         fieldsArg
238                         .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
239                         (hasHeader, headerFields, fieldsOptionString)
240                         .array;
241                 }
242 
243                 enforce(fields.length <= 1 || fields.all!(x => x != 0),
244                         "Whole line as key (--f|field 0) cannot be combined with multiple fields.");
245 
246                 if (fields.length == 0)
247                 {
248                     keyIsFullLine = true;
249                 }
250                 else if (fields.length == 1 && fields[0] == 0)
251                 {
252                     keyIsFullLine = true;
253                     fields.length = 0;
254                 }
255 
256                 if (onlyRepeated && atLeast <= 1) atLeast = 2;
257                 if (atLeast >= 2 && max < atLeast)
258                 {
259                     // Don't modify max if it is zero and equivMode or numberMode is in effect.
260                     if (max != 0 || (!equivMode && !numberMode)) max = atLeast;
261                 }
262 
263                 if (!keyIsFullLine) fields.each!((ref x) => --x);  // Convert to 0-based indexing.
264             }
265 
266             if (!hasHeader) fieldListArgProcessing();
267 
268             /*
269              * Create the inputSourceRange and perform header line processing.
270              */
271             ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
272             inputSources = inputSourceRange(filepaths, readHeader);
273 
274             if (hasHeader)
275             {
276                 throwIfWindowsNewline(inputSources.front.header, inputSources.front.name, 1);
277                 headerFields = inputSources.front.header.split(delim).to!(string[]);
278                 fieldListArgProcessing();
279             }
280         }
281         catch (Exception exc)
282         {
283             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
284             return tuple(false, 1);
285         }
286         return tuple(true, 0);
287     }
288 }
289 
290 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
291 
292 /** Main program. Processes command line arguments and calls tsvUniq which implements
293  * the main processing logic.
294  */
295 int main(string[] cmdArgs)
296 {
297     /* When running in DMD code coverage mode, turn on report merging. */
298     version(D_Coverage) version(DigitalMars)
299     {
300         import core.runtime : dmd_coverSetMerge;
301         dmd_coverSetMerge(true);
302     }
303 
304     TsvUniqOptions cmdopt;
305     auto r = cmdopt.processArgs(cmdArgs);
306     if (!r[0]) return r[1];
307 
308     version(LDC_Profile)
309     {
310         import ldc.profile : resetAll;
311         resetAll();
312     }
313 
314     try tsvUniq(cmdopt);
315     catch (Exception exc)
316     {
317         stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
318         return 1;
319     }
320     return 0;
321 }
322 
323 /** Outputs the unique lines from all the input files.
324  *
325  * Processes the lines in each input file. All lines are added to an associated array.
326  * The first time a line is seen it is output. If key fields are being used these are
327  * used as the basis for the associative array entries rather than the full line.
328  */
329 void tsvUniq(ref TsvUniqOptions cmdopt)
330 {
331     import tsv_utils.common.utils : bufferedByLine, BufferedOutputRange, InputFieldReordering,
332         InputSourceRange, joinAppend, LineBuffered, throwIfWindowsNewline;
333     import std.algorithm : splitter;
334     import std.array : appender;
335     import std.conv : to;
336     import std.uni : asLowerCase;
337     import std.utf : byChar;
338 
339     /* inputSources must be an InputSourceRange and include at least stdin. */
340     assert(!cmdopt.inputSources.empty);
341     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
342 
343     /* InputFieldReordering maps the key fields from an input line to a separate buffer. */
344     auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields);
345 
346     /* Both input and output are read one line at a time when in line-buffered mode. */
347     immutable LineBuffered isLineBuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
348 
349     /* BufferedOutputRange is a performance enhancement for writing to stdout. */
350     auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout, isLineBuffered);
351 
352     /* The master hash. The key is the specified fields concatenated together (including
353      * separators). The value is a struct with the equiv-id and occurrence count.
354      */
355     static struct EquivEntry { size_t equivID; size_t count; }
356     EquivEntry[string] equivHash;
357 
358     /* Reusable buffers for multi-field keys and case-insensitive keys. */
359     auto multiFieldKeyBuffer = appender!(char[]);
360     auto lowerKeyBuffer = appender!(char[]);
361 
362     const size_t numKeyFields = cmdopt.fields.length;
363     long nextEquivID = cmdopt.equivStartID;
364 
365     /* First header is read during command line arg processing. Flush it immediately
366      * so subsequent processes in a unix command pipeline see it early. This helps
367      * provide timely error messages.
368      */
369     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
370     {
371         auto inputStream = cmdopt.inputSources.front;
372 
373         bufferedOutput.append(inputStream.header);
374 
375         if (cmdopt.equivMode) bufferedOutput.append(cmdopt.delim, cmdopt.equivHeader);
376         if (cmdopt.numberMode) bufferedOutput.append(cmdopt.delim, cmdopt.numberHeader);
377 
378         bufferedOutput.appendln();
379         bufferedOutput.flush();
380     }
381 
382     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
383 
384     foreach (inputStream; cmdopt.inputSources)
385     {
386         if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);
387 
388         foreach (lineNum, line;
389                  inputStream
390                  .file
391                  .bufferedByLine(isLineBuffered)
392                  .enumerate(fileBodyStartLine))
393         {
394             if (lineNum == 1) throwIfWindowsNewline(line, inputStream.name, lineNum);
395 
396             /* Start by finding the key. */
397             typeof(line) key;
398             if (cmdopt.keyIsFullLine)
399             {
400                 key = line;
401             }
402             else
403             {
404                 assert(keyFieldsReordering !is null);
405 
406                 /* Copy the key fields to a new buffer. */
407                 keyFieldsReordering.initNewLine;
408                 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
409                 {
410                     keyFieldsReordering.processNextField(fieldIndex, fieldValue);
411                     if (keyFieldsReordering.allFieldsFilled) break;
412                 }
413 
414                 enforce(keyFieldsReordering.allFieldsFilled,
415                         format("Not enough fields in line. File: %s, Line: %s",
416                                inputStream.name, lineNum));
417 
418                 if (numKeyFields == 1)
419                 {
420                     key = keyFieldsReordering.outputFields[0];
421                 }
422                 else
423                 {
424                     multiFieldKeyBuffer.clear();
425                     keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim);
426                     key = multiFieldKeyBuffer.data;
427                 }
428             }
429 
430             if (cmdopt.ignoreCase)
431             {
432                 /* Equivalent to key = key.toLower, but without memory allocation. */
433                 lowerKeyBuffer.clear();
434                 lowerKeyBuffer.put(key.asLowerCase.byChar);
435                 key = lowerKeyBuffer.data;
436             }
437 
438             bool isOutput = false;
439             EquivEntry currEntry;
440             EquivEntry* priorEntry = (key in equivHash);
441             if (priorEntry is null)
442             {
443                 isOutput = (cmdopt.atLeast <= 1);
444                 currEntry.equivID = nextEquivID;
445                 currEntry.count = 1;
446                 equivHash[key.to!string] = currEntry;
447                 nextEquivID++;
448             }
449             else
450             {
451                 (*priorEntry).count++;
452                 currEntry = *priorEntry;
453 
454                 if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) ||
455                     (cmdopt.equivMode && cmdopt.max == 0) ||
456                     (cmdopt.numberMode && cmdopt.max == 0))
457                 {
458                     isOutput = true;
459                 }
460             }
461 
462             if (isOutput)
463             {
464                 bufferedOutput.append(line);
465 
466                 if (cmdopt.equivMode)
467                 {
468                     bufferedOutput.append(cmdopt.delim, currEntry.equivID.to!string);
469                 }
470 
471                 if (cmdopt.numberMode)
472                 {
473                     bufferedOutput.append(cmdopt.delim, currEntry.count.to!string);
474                 }
475 
476                 bufferedOutput.appendln();
477             }
478         }
479     }
480 }