From 0faea814daaa40a4b61ade748e3d1309b7d39469 Mon Sep 17 00:00:00 2001
From: Alexandre Duret-Lutz <adl@lrde.epita.fr>
Date: Fri, 29 Nov 2013 09:32:51 +0100
Subject: [PATCH] bin: add support for reading formulas from CSV files.

* NEWS: Mention it.
* src/bin/common_finput.cc, src/bin/common_finput.hh: Implement it.
* src/bin/common_output.cc, src/bin/common_output.hh: Add the %< and
%> escapes.
* src/bin/ltlfilt.cc: Connect %< and %> to the
prefix andsuffix of the input, and document them.
* src/tgbatest/det.test, src/tgbatest/nondet.test: Simplify these
tests that read CSV files.
---
 NEWS                     |   9 ++
 src/bin/common_finput.cc | 196 +++++++++++++++++++++++++++++++++++++--
 src/bin/common_finput.hh |  13 ++-
 src/bin/common_output.cc |  17 +++-
 src/bin/common_output.hh |   3 +-
 src/bin/ltlfilt.cc       |  10 +-
 src/tgbatest/det.test    |   2 +-
 src/tgbatest/nondet.test |   8 +-
 8 files changed, 236 insertions(+), 22 deletions(-)
diff --git a/NEWS b/NEWS
index 93cda64d0..ad4f3d206 100644
--- a/NEWS
+++ b/NEWS
@@ -22,6 +22,15 @@ New in spot 1.2a (not released)
       output in CSV or JSON will have some extra columns to report
       the size of these input automata before ltlcross converts them
       into TGBA to perform its regular checks.
+    - ltlfilt, ltl2tgba, ltl2tgta, and ltlcross can now read formulas
+      from CSV files.  Use option -F FILE/COL to read formulas from
+      column COL of FILE.  Use -F FILE/-COL if the first line of
+      FILE be ignored.
+    - when ltlfilt processes formulas from a CSV file, it will output
+      each CSV line whose formula matches the given constraints, with
+      the rewriten formula.  The new escape sequence %< (text in
+      columns before the formula) and %> (text after) can be used
+      with the --format option to alter this output.
   * Bug fixes:
     - ltlcross' CSV output now stricly follows RFC 4180.
     - ltlcross failed to report missing input or output escape sequences
diff --git a/src/bin/common_finput.cc b/src/bin/common_finput.cc
index decf96fff..af9bee438 100644
--- a/src/bin/common_finput.cc
+++ b/src/bin/common_finput.cc
@@ -22,6 +22,7 @@
 #include "ltlparse/public.hh"
 
 #include <fstream>
+#include <cstring>
 
 #define OPT_LBT 1
 #define OPT_LENIENT 2
@@ -34,8 +35,10 @@ static const argp_option options[] =
   {
     { 0, 0, 0, 0, "Input options:", 1 },
     { "formula", 'f', "STRING", 0, "process the formula STRING", 0 },
-    { "file", 'F', "FILENAME", 0,
-      "process each line of FILENAME as a formula", 0 },
+    { "file", 'F', "FILENAME[/COL]", 0,
+      "process each line of FILENAME as a formula; if COL is a "
+      "positive integer, assume a CSV file and read column COL; use "
+      "a negative COL to drop the first line of the CSV file", 0 },
     { "lbt-input", OPT_LBT, 0, 0,
       "read all formulas using LBT's prefix syntax", 0 },
     { "lenient", OPT_LENIENT, 0, 0,
@@ -82,10 +85,22 @@ parse_formula(const std::string& s, spot::ltl::parse_error_list& pel)
 }
 
 job_processor::job_processor()
-  : abort_run(false)
+  : abort_run(false), real_filename(0),
+    col_to_read(0), prefix(0), suffix(0)
 {
 }
 
+job_processor::~job_processor()
+{
+  if (real_filename)
+    free(real_filename);
+  if (prefix)
+    free(prefix);
+  if (suffix)
+    free(suffix);
+}
+
+
 int
 job_processor::process_string(const std::string& input,
 			      const char* filename,
@@ -113,9 +128,139 @@ job_processor::process_stream(std::istream& is,
   int error = 0;
   int linenum = 0;
   std::string line;
+
+  // Discard the first line of a CSV file if requested.
+  if (col_to_read < 0)
+    {
+      std::getline(is, line);
+      col_to_read = -col_to_read;
+    }
+
+  // Each line of the file and send them to process_string,
+  // optionally extracting a column of a CSV file.
   while (!abort_run && std::getline(is, line))
     if (!line.empty())
-      error |= process_string(line, filename, ++linenum);
+      {
+	if (col_to_read == 0)
+	  {
+	    error |= process_string(line, filename, ++linenum);
+	  }
+	else // We are reading column COL_TO_READ in a CSV file.
+	  {
+	    // FIXME: This code assumes an entire CSV row was been
+	    // fetched by getline().  This is incorrect for processing
+	    // CSV files with fields that contain newlines inside
+	    // double-quoted strings.  Patching this code to deal with
+	    // such files is left as an exercise for the first user
+	    // who encounters the issue.
+	    const char* str = line.c_str();
+	    const char* col1_start = str;
+	    // Delimiters for the extracted column.
+	    const char* coln_start = str;
+	    const char* coln_end = 0;
+	    // The current column.  (1-based)
+	    int colnum = 1;
+	    // Whether we are parsing a double-quoted string.
+	    bool instring = false;
+	    // Note that RFC 4180 has strict rules about
+	    // double-quotes: ① if a field is double-quoted, the first
+	    // and last characters of the field should be
+	    // double-quotes; ② if a field contains a double-quote
+	    // then it should be double quoted, and the occurrences
+	    // of double-quotes should be doubled.  Therefore a CSV file
+	    // may no contain a line such as:
+	    //    foo,bar"ba""z",12
+	    // Tools have different interpretation of such a line.
+	    // For instance Python's pandas.read_csv() function will
+	    // load the second field verbatim as the string 'bar"ba""z"',
+	    // while R's read.csv() function will load it as the
+	    // string 'barba"z'.  We use this second interpretation, because
+	    // it also makes it possible to parse CSVs fields formatted
+	    // with leading spaces (often for cosmetic purpose).  When
+	    // extracting the second field of
+	    //    foo, "ba""z", 12
+	    // we will return ' baz' and the leading space will be ignored
+	    // by our LTL formula parser.
+	    while (*str)
+	      {
+		switch (*str)
+		  {
+		  case '"':
+		    // Doubled double-quotes are used to escape
+		    // double-quotes.
+		    if (instring && str[1] == '"')
+		      ++str;
+		    else
+		      instring = !instring;
+		    break;
+		  case ',':
+		    if (!instring)
+		      {
+			if (col_to_read == colnum)
+			  coln_end = str;
+			++colnum;
+			if (col_to_read == colnum)
+			  coln_start = str + 1;
+		      }
+		    break;
+		  }
+		// Once we have the end delimiter for our target
+		// column, we have all we need.
+		if (coln_end)
+		  break;
+		++str;
+	      }
+	    if (!*str)
+	      {
+		if (colnum != col_to_read)
+		  // Skip this line as it has no enough columns.
+		  continue;
+		else
+		  // The target columns ends at the end of the line.
+		  coln_end = str;
+	      }
+
+	    // Skip the line if it has an empty field.
+	    if (coln_start == coln_end)
+	      continue;
+
+	    // save the contents before and after that columns for the
+	    // %< and %> escapes (ignoring the trailing and leading
+	    // commas).
+	    prefix = (col_to_read != 1) ?
+	      strndup(col1_start, coln_start - col1_start - 1) : 0;
+	    suffix = (*coln_end != 0) ? strdup(coln_end + 1) : 0;
+	    std::string field(coln_start, coln_end);
+	    // Remove double-quotes if any.
+	    if (field.find('"') != std::string::npos)
+	      {
+		unsigned dst = 0;
+		bool instring = false;
+		for (; coln_start != coln_end; ++coln_start)
+		  if (*coln_start == '"')
+		    // A doubled double-quote instead a double-quoted
+		    // string is an escaped double-quote.
+		    if (instring && coln_start[1] == '"')
+		      field[dst++] = *++coln_start;
+		    else
+		      instring = !instring;
+		  else
+		    field[dst++] = *coln_start;
+		field.resize(dst);
+	      }
+	    error |= process_string(field, filename, ++linenum);
+	    if (prefix)
+	      {
+		free(prefix);
+		prefix = 0;
+	      }
+	    if (suffix)
+	      {
+		free(suffix);
+		suffix = 0;
+	      }
+	  }
+      }
   return error;
 }
 
@@ -128,9 +273,46 @@ job_processor::process_file(const char* filename)
 
   errno = 0;
   std::ifstream input(filename);
-  if (!input)
-    error(2, errno, "cannot open '%s'", filename);
-  return process_stream(input, filename);
+  if (input)
+    return process_stream(input, filename);
+  int saved_errno = errno;
+
+  // If we have a filename like "foo/NN" such
+  // that:
+  // ① foo/NN is not a file (already the case),
+  // ② NN is a number > 0,
+  // ③ foo is a file,
+  // then it means we want to open foo as
+  // a CSV file and process column NN.
+
+  if (const char* slash = strrchr(filename, '/'))
+    {
+      char* end;
+      errno = 0;
+      long int col = strtol(slash + 1, &end, 10);
+      // strtol ate all remaining characters and NN is positive
+      if (errno == 0 && !*end && col != 0)
+	{
+	  col_to_read = col;
+	  if (real_filename)
+	    free(real_filename);
+	  real_filename = strndup(filename, slash - filename);
+
+	  // Special case for stdin.
+	  if (real_filename[0] == '-' && real_filename[1] == 0)
+	    return process_stream(std::cin, real_filename);
+
+	  std::ifstream input(real_filename);
+	  if (input)
+	    return process_stream(input, real_filename);
+
+	  error(2, errno, "cannot open '%s' nor '%s'",
+		filename, real_filename);
+	}
+    }
+
+  error(2, saved_errno, "cannot open '%s'", filename);
+  return -1;
 }
 
 int
diff --git a/src/bin/common_finput.hh b/src/bin/common_finput.hh
index 15cdc82b6..35439b39a 100644
--- a/src/bin/common_finput.hh
+++ b/src/bin/common_finput.hh
@@ -1,6 +1,6 @@
 // -*- coding: utf-8 -*-
-// Copyright (C) 2012 Laboratoire de Recherche et Développement de
-// l'Epita (LRDE).
+// Copyright (C) 2012, 2013 Laboratoire de Recherche et Développement
+// de l'Epita (LRDE).
 //
 // This file is part of Spot, a model checking library.
 //
@@ -56,9 +56,7 @@ protected:
 public:
   job_processor();
 
-  virtual ~job_processor()
-  {
-  }
+  virtual ~job_processor();
 
   virtual int
   process_formula(const spot::ltl::formula* f,
@@ -75,6 +73,11 @@ public:
 
   virtual int
   run();
+
+  char* real_filename;
+  long int col_to_read;
+  char* prefix;
+  char* suffix;
 };
 
 
diff --git a/src/bin/common_output.cc b/src/bin/common_output.cc
index fa7718cd8..69aab4301 100644
--- a/src/bin/common_output.cc
+++ b/src/bin/common_output.cc
@@ -109,6 +109,8 @@ namespace
     const spot::ltl::formula* f;
     const char* filename;
     int line;
+    const char* prefix;
+    const char* suffix;
   };
 
   class printable_formula:
@@ -141,6 +143,8 @@ namespace
       declare('f', &fl_);
       declare('F', &filename_);
       declare('L', &line_);
+      declare('<', &prefix_);
+      declare('>', &suffix_);
       set_output(os);
     }
 
@@ -150,6 +154,8 @@ namespace
       fl_ = &fl;
       filename_ = fl.filename ? fl.filename : "";
       line_ = fl.line;
+      prefix_ = fl.prefix ? fl.prefix : "";
+      suffix_ = fl.suffix ? fl.suffix : "";
       return format(format_);
     }
 
@@ -158,6 +164,8 @@ namespace
     printable_formula fl_;
     spot::printable_value<const char*> filename_;
     spot::printable_value<int> line_;
+    spot::printable_value<const char*> prefix_;
+    spot::printable_value<const char*> suffix_;
   };
 }
 
@@ -202,15 +210,20 @@ parse_opt_output(int key, char* arg, struct argp_state*)
 
 
 void
-output_formula(const spot::ltl::formula* f, const char* filename, int linenum)
+output_formula(const spot::ltl::formula* f, const char* filename, int linenum,
+	       const char* prefix, const char* suffix)
 {
   if (!format)
     {
+      if (prefix)
+	std::cout << prefix << ",";
       stream_formula(std::cout, f, filename, linenum);
+      if (suffix)
+	std::cout << "," << suffix;
     }
   else
     {
-      formula_with_location fl = { f, filename, linenum };
+      formula_with_location fl = { f, filename, linenum, prefix, suffix };
       format->print(fl);
     }
 
diff --git a/src/bin/common_output.hh b/src/bin/common_output.hh
index cc27027ca..1714cad6f 100644
--- a/src/bin/common_output.hh
+++ b/src/bin/common_output.hh
@@ -35,6 +35,7 @@ extern const struct argp output_argp;
 int parse_opt_output(int key, char* arg, struct argp_state* state);
 
 void output_formula(const spot::ltl::formula* f,
-		    const char* filename = 0, int linenum = 0);
+		    const char* filename = 0, int linenum = 0,
+		    const char* prefix = 0, const char* suffix = 0);
 
 #endif // SPOT_BIN_COMMON_OUTPUT_HH
diff --git a/src/bin/ltlfilt.cc b/src/bin/ltlfilt.cc
index 9d9bdd5d0..901521e31 100644
--- a/src/bin/ltlfilt.cc
+++ b/src/bin/ltlfilt.cc
@@ -165,6 +165,12 @@ static const argp_option options[] =
       "the name of the input file", 0 },
     { "%L", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
       "the original line number in the input file", 0 },
+    { "%<", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
+      "the part of the line before the formula if it "
+      "comes from a column extracted from a CSV file", 0 },
+    { "%>", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
+      "the part of the line after the formula if it "
+      "comes from a column extracted from a CSV file", 0 },
     { "%%", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
       "a single %", 0 },
     { 0, 0, 0, 0, "Miscellaneous options:", -1 },
@@ -571,7 +577,7 @@ namespace
       if (matched)
 	{
 	  one_match = true;
-	  output_formula(f, filename, linenum);
+	  output_formula(f, filename, linenum, prefix, suffix);
 	}
       f->destroy();
       return 0;
@@ -584,7 +590,7 @@ main(int argc, char** argv)
 {
   setup(argv);
 
-  const argp ap = { options, parse_opt, "[FILENAME...]",
+  const argp ap = { options, parse_opt, "[FILENAME[/COL]...]",
 		    argp_program_doc, children, 0, 0 };
 
   if (int err = argp_parse(&ap, argc, argv, ARGP_NO_HELP, 0, 0))
diff --git a/src/tgbatest/det.test b/src/tgbatest/det.test
index 53fcc96ed..5e7d70915 100755
--- a/src/tgbatest/det.test
+++ b/src/tgbatest/det.test
@@ -56,7 +56,7 @@ cat >formulas <<'EOF'
 1,4,G(G!a | F!c | G!b)
 EOF
 
-cut -d, -f3 <formulas | $ltl2tgba -x tba-det --det --stats '%d,%s,%f' -F - > out
+$ltl2tgba -x tba-det --det --stats '%d,%s,%f' -F formulas/3 > out
 diff formulas out
 
 cat >in.tgba <<'EOF'
diff --git a/src/tgbatest/nondet.test b/src/tgbatest/nondet.test
index 556122a54..8057f7558 100755
--- a/src/tgbatest/nondet.test
+++ b/src/tgbatest/nondet.test
@@ -28,8 +28,8 @@ a U b, 1 0
 G(!r | Fa) | Fx, 0 1
 EOF
 
-cut -d, -f1 expected.1 |
-../../bin/ltl2tgba -F- --stats='%f, %d %p' >out.1
+# also test the filename/COL syntax
+../../bin/ltl2tgba -F expected.1/1 --stats='%f, %d %p' >out.1
 diff out.1 expected.1
 
 cat >expected.2<<EOF
@@ -39,8 +39,8 @@ a U b, 1 1
 G(!r | Fa) | Fx, 0 1
 EOF
 
-cut -d, -f1 expected.2 |
-../../bin/ltl2tgba -C -F- --stats='%f, %d %p' >out.2
+# filename/COL should also work when filename=-
+../../bin/ltl2tgba -C -F-/1 --stats='%f, %d %p' <expected.2 >out.2
 diff out.2 expected.2