1 /**
2 Command line tool that identifies equivalent lines in an input stream. Equivalent
3 lines are identified using either the full line or a set of fields as the key. By
4 default, input is written to standard output, retaining only the first occurrence of
5 equivalent lines. There are also options for marking and numbering equivalent lines
6 rather, without filtering out duplicates.
7 
8 This tool is similar in spirit to the Unix 'uniq' tool, with some key differences.
9 First, the key can be composed of individual fields, not just the full line. Second,
10 input does not need to be sorted. (Unix 'uniq' only detects equivalent lines when
11 they are adjacent, hence the usual need for sorting.)
12 
13 There are a couple alternative to uniq'ing the input lines. One is to mark lines with
14 an equivalence ID, which is a one-upped counter. The other is to number lines, with
15 each unique key have its own set of numbers.
16 
17 Copyright (c) 2015-2020, eBay Inc.
18 Initially written by Jon Degenhardt
19 
20 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
21 */
22 module tsv_utils.tsv_uniq;
23 
24 import std.stdio;
25 import std.format : format;
26 import std.typecons : tuple;
27 
28 auto helpText = q"EOS
29 Synopsis: tsv-uniq [options] [file...]
30 
31 tsv-uniq filters out duplicate lines using fields as a key. Filtering is
32 based on the entire line when key fields are not provided. Options are
33 also available for assigning a unique id to each key and numbering the
34 occurrences of each key. Use '--help-verbose' for more details.
35 
36 Options:
37 EOS";
38 
39 auto helpTextVerbose = q"EOS
40 Synopsis: tsv-uniq [options] [file...]
41 
42 tsv-uniq identifies equivalent lines in tab-separated value files. Input
43 is read line by line, recording a key for each line based on one or more
44 of the fields. Two lines are equivalent if they have the same key. The
45 first time a key is seen its line is written to standard output.
46 Subsequent lines containing the same key are discarded. This command
47 uniq's a file on fields 2 and 3:
48 
49    tsv-uniq -f 2,3 file.tsv
50 
51 This is similar to the Unix 'uniq' program, but based on individual
52 fields and without requiring sorted data.
53 
54 tsv-uniq can be run without specifying a key field. In this case the
55 whole line is used as a key, same as the Unix 'uniq' program. This works
56 on any line-oriented text file, not just TSV files.
57 
58 The above is the default behavior ('uniq' mode). The alternates to 'uniq'
59 mode are 'number' mode and 'equiv-class' mode. In 'equiv-class' mode, all
60 lines are written to standard output, but with a field appended marking
61 equivalent entries with an ID. The ID is a one-upped counter. Example:
62 
63    tsv-uniq --header -f 2,3 --equiv file.tsv
64 
65 'Number' mode also writes all lines to standard output, but with a field
66 appended numbering the occurrence count for the line's key. The first line
67 with a specific key is assigned the number '1', the second with the key is
68 assigned number '2', etc. 'Number' and 'equiv-class' modes can be combined.
69 
70 The '--r|repeated' option can be used to print only lines occurring more
71 than once. Specifically, the second occurrence of a key is printed. The
72 '--a|at-least N' option is similar, printing lines occurring at least N
73 times. (Like repeated, the Nth line with the key is printed.)
74 
75 The '--m|max MAX' option changes the behavior to output the first MAX
76 lines for each key, rather than just the first line for each key.
77 
78 If both '--a|at-least' and '--m|max' are specified, the occurrences
79 starting with 'at-least' and ending with 'max' are output.
80 
81 Options:
82 EOS";
83 
84 /** Container for command line options.
85  */
86 struct TsvUniqOptions
87 {
88     enum defaultEquivHeader = "equiv_id";
89     enum defaultEquivStartID = 1;
90     enum defaultNumberHeader = "equiv_line";
91 
92     string programName;
93     bool helpVerbose = false;                 // --h|help-verbose
94     bool versionWanted = false;               // --V|version
95     size_t[] fields;                          // --f|fields
96     bool hasHeader = false;                   // --H|header
97     bool onlyRepeated = false;                // --r|repeated. Shorthand for '--atleast 2'
98     size_t atLeast = 0;                       // --a|at-least. Zero implies default behavior.
99     size_t max = 0;                           // --m|max. Zero implies default behavior.
100     bool numberMode = false;                  // --z|number
101     string numberHeader = defaultNumberHeader;  // --number-header
102     bool equivMode = false;                   // --e|equiv
103     string equivHeader = defaultEquivHeader;  // --equiv-header
104     long equivStartID = defaultEquivStartID;  // --equiv-start
105     bool ignoreCase = false;                  // --i|ignore-case
106     char delim = '\t';                        // --d|delimiter
107     bool keyIsFullLine = false;               // Derived. True if no fields specified or '--f|fields 0'
108 
109     /* Returns a tuple. First value is true if command line arguments were successfully
110      * processed and execution should continue, or false if an error occurred or the user
111      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
112      *
113      * Returning true (execution continues) means args have been validated and derived
114      * values calculated. In addition, field indices have been converted to zero-based.
115      * If the whole line is the key, the individual fields list will be cleared.
116      *
117      * Repeat count control variables 'atLeast' and max' - These values are left at zero
118      * if no repeat count options are specified. They are set if repeat count options
119      * are specified, as follows:
120      *   * atLeast - Will be zero unless --r|repeated or --a|at-least is specified.
121      *     --r|repeated option sets it 2, --a|at-least sets it to the specified value.
122      *   * max - Default to zero. Is set to the --m|max value if provided. Is set to
123      *    'atLeast' if --r|repeated or --a|at-least is provided.
124      *
125      * An exception to the above: If --e|equiv-mode is specified, then (max == 0)
126      * represents the default "output all values" case. In this case max may be less
127      * than the at-least value.
128      */
129     auto processArgs (ref string[] cmdArgs)
130     {
131         import std.algorithm : any, each;
132         import std.getopt;
133         import std.path : baseName, stripExtension;
134         import std.typecons : Yes, No;
135         import tsv_utils.common.utils :  makeFieldListOptionHandler;
136 
137         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
138 
139         try
140         {
141             arraySep = ",";    // Use comma to separate values in command line options
142             auto r = getopt(
143                 cmdArgs,
144                 "help-verbose",  "              Print full help.", &helpVerbose,
145                 std.getopt.config.caseSensitive,
146                 "V|version",     "              Print version information and exit.", &versionWanted,
147                 "H|header",      "              Treat the first line of each file as a header.", &hasHeader,
148                 std.getopt.config.caseInsensitive,
149 
150                 "f|fields",      "<field-list>  Fields to use as the key. Default: 0 (entire line).",
151                 fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex,Yes.allowFieldNumZero),
152 
153                 "i|ignore-case", "              Ignore case when comparing keys.", &ignoreCase,
154                 "r|repeated",    "              Output only lines that are repeated (based on the key).", &onlyRepeated,
155                 "a|at-least",    "INT           Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast,
156                 "m|max",         "INT           Max number of each unique key to output (zero is ignored).", &max,
157                 "e|equiv",       "              Output equivalence class IDs rather than uniq'ing entries.", &equivMode,
158                 "equiv-header",  "STR           Use STR as the equiv-id field header (when using '-H --equiv'). Default: 'equiv_id'.", &equivHeader,
159                 "equiv-start",   "INT           Use INT as the first equiv-id. Default: 1.", &equivStartID,
160                 "z|number",      "              Output equivalence class occurrence counts rather than uniq'ing entries.", &numberMode,
161                 "number-header", "STR           Use STR as the '--number' field header (when using '-H --number)'. Default: 'equiv_line'.", &numberHeader,
162                 "d|delimiter",   "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
163                 );
164 
165             if (r.helpWanted)
166             {
167                 defaultGetoptPrinter(helpText, r.options);
168                 return tuple(false, 0);
169             }
170             else if (helpVerbose)
171             {
172                 defaultGetoptPrinter(helpTextVerbose, r.options);
173                 return tuple(false, 0);
174             }
175             else if (versionWanted)
176             {
177                 import tsv_utils.common.tsvutils_version;
178                 writeln(tsvutilsVersionNotice("tsv-uniq"));
179                 return tuple(false, 0);
180             }
181 
182             /* Consistency checks */
183             if (!equivMode)
184             {
185                 if (equivHeader != defaultEquivHeader)
186                 {
187                     throw new Exception("--equiv-header requires --e|equiv");
188                 }
189                 else if (equivStartID != defaultEquivStartID)
190                 {
191                     throw new Exception("--equiv-start requires --e|equiv");
192                 }
193             }
194 
195             if (!numberMode && numberHeader != defaultNumberHeader)
196             {
197                  throw new Exception("--number-header requires --z|number");
198             }
199 
200             if (fields.length > 1 && fields.any!(x => x == 0))
201             {
202                 throw new Exception("Whole line as key (--f|field 0) cannot be combined with multiple fields.");
203             }
204 
205             /* Derivations */
206             if (fields.length == 0)
207             {
208                 keyIsFullLine = true;
209             }
210             else if (fields.length == 1 && fields[0] == 0)
211             {
212                 keyIsFullLine = true;
213                 fields.length = 0;
214             }
215 
216             if (onlyRepeated && atLeast <= 1) atLeast = 2;
217             if (atLeast >= 2 && max < atLeast)
218             {
219                 // Don't modify max if it is zero and equivMode or numberMode is in effect.
220                 if (max != 0 || (!equivMode && !numberMode)) max = atLeast;
221             }
222 
223             if (!keyIsFullLine) fields.each!((ref x) => --x);  // Convert to 0-based indexing.
224 
225         }
226         catch (Exception exc)
227         {
228             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
229             return tuple(false, 1);
230         }
231         return tuple(true, 0);
232     }
233 }
234 
235 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
236 
237 /** Main program. Processes command line arguments and calls tsvUniq which implements
238  * the main processing logic.
239  */
240 int main(string[] cmdArgs)
241 {
242     /* When running in DMD code coverage mode, turn on report merging. */
243     version(D_Coverage) version(DigitalMars)
244     {
245         import core.runtime : dmd_coverSetMerge;
246         dmd_coverSetMerge(true);
247     }
248 
249     TsvUniqOptions cmdopt;
250     auto r = cmdopt.processArgs(cmdArgs);
251     if (!r[0]) return r[1];
252 
253     version(LDC_Profile)
254     {
255         import ldc.profile : resetAll;
256         resetAll();
257     }
258 
259     try tsvUniq(cmdopt, cmdArgs[1..$]);
260     catch (Exception exc)
261     {
262         stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
263         return 1;
264     }
265     return 0;
266 }
267 
268 /** Outputs the unique lines from all the input files.
269  *
270  * Processes the lines in each input file. All lines are added to an associated array.
271  * The first time a line is seen it is output. If key fields are being used these are
272  * used as the basis for the associative array entries rather than the full line.
273  */
274 void tsvUniq(const TsvUniqOptions cmdopt, const string[] inputFiles)
275 {
276     import tsv_utils.common.utils : InputFieldReordering, bufferedByLine, BufferedOutputRange, joinAppend;
277     import std.algorithm : splitter;
278     import std.array : appender;
279     import std.conv : to;
280     import std.range;
281     import std.uni : asLowerCase;
282     import std.utf : byChar;
283 
284     /* InputFieldReordering maps the key fields from an input line to a separate buffer. */
285     auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields);
286 
287     /* BufferedOutputRange is a performance enhancement for writing to stdout. */
288     auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
289 
290     /* The master hash. The key is the specified fields concatenated together (including
291      * separators). The value is a struct with the equiv-id and occurrence count.
292      */
293     static struct EquivEntry { size_t equivID; size_t count; }
294     EquivEntry[string] equivHash;
295 
296     /* Reusable buffers for multi-field keys and case-insensitive keys. */
297     auto multiFieldKeyBuffer = appender!(char[]);
298     auto lowerKeyBuffer = appender!(char[]);
299 
300     const size_t numKeyFields = cmdopt.fields.length;
301     long nextEquivID = cmdopt.equivStartID;
302     bool headerWritten = false;
303     foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
304     {
305         auto inputStream = (filename == "-") ? stdin : filename.File();
306         foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
307         {
308             if (cmdopt.hasHeader && lineNum == 1)
309             {
310                 /* Header line. */
311                 if (!headerWritten)
312                 {
313                     bufferedOutput.append(line);
314 
315                     if (cmdopt.equivMode)
316                     {
317                         bufferedOutput.append(cmdopt.delim);
318                         bufferedOutput.append(cmdopt.equivHeader);
319                     }
320 
321                     if (cmdopt.numberMode)
322                     {
323                         bufferedOutput.append(cmdopt.delim);
324                         bufferedOutput.append(cmdopt.numberHeader);
325                     }
326 
327                     bufferedOutput.appendln();
328                     headerWritten = true;
329                 }
330             }
331             else
332             {
333                 /* Regular line (not header). Start by finding the key. */
334                 typeof(line) key;
335                 if (cmdopt.keyIsFullLine)
336                 {
337                     key = line;
338                 }
339                 else
340                 {
341                     assert(keyFieldsReordering !is null);
342 
343                     /* Copy the key fields to a new buffer. */
344                     keyFieldsReordering.initNewLine;
345                     foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
346                     {
347                         keyFieldsReordering.processNextField(fieldIndex, fieldValue);
348                         if (keyFieldsReordering.allFieldsFilled) break;
349                     }
350 
351                     if (!keyFieldsReordering.allFieldsFilled)
352                     {
353                         throw new Exception(
354                             format("Not enough fields in line. File: %s, Line: %s",
355                                    (filename == "-") ? "Standard Input" : filename, lineNum));
356                     }
357 
358                     if (numKeyFields == 1)
359                     {
360                         key = keyFieldsReordering.outputFields[0];
361                     }
362                     else
363                     {
364                         multiFieldKeyBuffer.clear();
365                         keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim);
366                         key = multiFieldKeyBuffer.data;
367                     }
368                 }
369 
370                 if (cmdopt.ignoreCase)
371                 {
372                     /* Equivalent to key = key.toLower, but without memory allocation. */
373                     lowerKeyBuffer.clear();
374                     lowerKeyBuffer.put(key.asLowerCase.byChar);
375                     key = lowerKeyBuffer.data;
376                 }
377 
378                 bool isOutput = false;
379                 EquivEntry currEntry;
380                 EquivEntry* priorEntry = (key in equivHash);
381                 if (priorEntry is null)
382                 {
383                     isOutput = (cmdopt.atLeast <= 1);
384                     currEntry.equivID = nextEquivID;
385                     currEntry.count = 1;
386                     equivHash[key.to!string] = currEntry;
387                     nextEquivID++;
388                 }
389                 else
390                 {
391                     (*priorEntry).count++;
392                     currEntry = *priorEntry;
393 
394                     if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) ||
395                         (cmdopt.equivMode && cmdopt.max == 0) ||
396                         (cmdopt.numberMode && cmdopt.max == 0))
397                     {
398                         isOutput = true;
399                     }
400                 }
401 
402                 if (isOutput)
403                 {
404                     bufferedOutput.append(line);
405 
406                     if (cmdopt.equivMode)
407                     {
408                         bufferedOutput.append(cmdopt.delim);
409                         bufferedOutput.append(currEntry.equivID.to!string);
410                     }
411 
412                     if (cmdopt.numberMode)
413                     {
414                         bufferedOutput.append(cmdopt.delim);
415                         bufferedOutput.append(currEntry.count.to!string);
416                     }
417 
418                     bufferedOutput.appendln();
419                 }
420             }
421         }
422     }
423 }