bin: add support for reading formulas from CSV files.

* NEWS: Mention it. * src/bin/common_finput.cc, src/bin/common_finput.hh: Implement it. * src/bin/common_output.cc, src/bin/common_output.hh: Add the %< and %> escapes. * src/bin/ltlfilt.cc: Connect %< and %> to the prefix and suffix of the input, and document them. * src/tgbatest/det.test, src/tgbatest/nondet.test: Simplify these tests that read CSV files.
2013-11-29 09:32:51 +01:00 · 2013-11-29 09:32:51 +01:00 · 0faea814da
commit 0faea814da
parent 8c5875314c
8 changed files with 236 additions and 22 deletions
--- a/9
+++ b/9
@ -22,6 +22,15 @@ New in spot 1.2a (not released)
      output in CSV or JSON will have some extra columns to report
      the size of these input automata before ltlcross converts them
      into TGBA to perform its regular checks.
+    - ltlfilt, ltl2tgba, ltl2tgta, and ltlcross can now read formulas
+      from CSV files.  Use option -F FILE/COL to read formulas from
+      column COL of FILE.  Use -F FILE/-COL if the first line of
+      FILE should be ignored.
+    - when ltlfilt processes formulas from a CSV file, it will output
+      each CSV line whose formula matches the given constraints, with
+      the rewritten formula.  The new escape sequences %< (text in
+      columns before the formula) and %> (text after) can be used
+      with the --format option to alter this output.
  * Bug fixes:
    - ltlcross' CSV output now stricly follows RFC 4180.
    - ltlcross failed to report missing input or output escape sequences
--- a/src/bin/common_finput.cc
+++ b/src/bin/common_finput.cc
@ -22,6 +22,7 @@
 #include "ltlparse/public.hh"

 #include <fstream>
+#include <cstring>

 #define OPT_LBT 1
 #define OPT_LENIENT 2
@ -34,8 +35,10 @@ static const argp_option options[] =
  {
    { 0, 0, 0, 0, "Input options:", 1 },
    { "formula", 'f', "STRING", 0, "process the formula STRING", 0 },
-    { "file", 'F', "FILENAME", 0,
-      "process each line of FILENAME as a formula", 0 },
+    { "file", 'F', "FILENAME[/COL]", 0,
+      "process each line of FILENAME as a formula; if COL is a "
+      "positive integer, assume a CSV file and read column COL; use "
+      "a negative COL to drop the first line of the CSV file", 0 },
    { "lbt-input", OPT_LBT, 0, 0,
      "read all formulas using LBT's prefix syntax", 0 },
    { "lenient", OPT_LENIENT, 0, 0,
@ -82,10 +85,22 @@ parse_formula(const std::string& s, spot::ltl::parse_error_list& pel)
 }

 job_processor::job_processor()
-  : abort_run(false)
+  : abort_run(false), real_filename(0),
+    col_to_read(0), prefix(0), suffix(0)
 {
 }

+job_processor::~job_processor()
+{
+  if (real_filename)
+    free(real_filename);
+  if (prefix)
+    free(prefix);
+  if (suffix)
+    free(suffix);
+}
+
+
 int
 job_processor::process_string(const std::string& input,
 			      const char* filename,
@ -113,9 +128,139 @@ job_processor::process_stream(std::istream& is,
  int error = 0;
  int linenum = 0;
  std::string line;
+
+  // Discard the first line of a CSV file if requested.
+  if (col_to_read < 0)
+    {
+      std::getline(is, line);
+      col_to_read = -col_to_read;
+    }
+
+  // Read each line of the file and send it to process_string,
+  // optionally extracting a column of a CSV file.
  while (!abort_run && std::getline(is, line))
    if (!line.empty())
+      {
+	if (col_to_read == 0)
+	  {
 	    error |= process_string(line, filename, ++linenum);
+	  }
+	else // We are reading column COL_TO_READ in a CSV file.
+	  {
+	    // FIXME: This code assumes an entire CSV row has been
+	    // fetched by getline().  This is incorrect for processing
+	    // CSV files with fields that contain newlines inside
+	    // double-quoted strings.  Patching this code to deal with
+	    // such files is left as an exercise for the first user
+	    // who encounters the issue.
+	    const char* str = line.c_str();
+	    const char* col1_start = str;
+	    // Delimiters for the extracted column.
+	    const char* coln_start = str;
+	    const char* coln_end = 0;
+	    // The current column.  (1-based)
+	    int colnum = 1;
+	    // Whether we are parsing a double-quoted string.
+	    bool instring = false;
+	    // Note that RFC 4180 has strict rules about
+	    // double-quotes: ① if a field is double-quoted, the first
+	    // and last characters of the field should be
+	    // double-quotes; ② if a field contains a double-quote
+	    // then it should be double quoted, and the occurrences
+	    // of double-quotes should be doubled.  Therefore a CSV file
+	    // may not contain a line such as:
+	    //    foo,bar"ba""z",12
+	    // Tools have different interpretation of such a line.
+	    // For instance Python's pandas.read_csv() function will
+	    // load the second field verbatim as the string 'bar"ba""z"',
+	    // while R's read.csv() function will load it as the
+	    // string 'barba"z'.  We use this second interpretation, because
+	    // it also makes it possible to parse CSVs fields formatted
+	    // with leading spaces (often for cosmetic purpose).  When
+	    // extracting the second field of
+	    //    foo, "ba""z", 12
+	    // we will return ' ba"z' and the leading space will be ignored
+	    // by our LTL formula parser.
+	    while (*str)
+	      {
+		switch (*str)
+		  {
+		  case '"':
+		    // Doubled double-quotes are used to escape
+		    // double-quotes.
+		    if (instring && str[1] == '"')
+		      ++str;
+		    else
+		      instring = !instring;
+		    break;
+		  case ',':
+		    if (!instring)
+		      {
+			if (col_to_read == colnum)
+			  coln_end = str;
+			++colnum;
+			if (col_to_read == colnum)
+			  coln_start = str + 1;
+		      }
+		    break;
+		  }
+		// Once we have the end delimiter for our target
+		// column, we have all we need.
+		if (coln_end)
+		  break;
+		++str;
+	      }
+	    if (!*str)
+	      {
+		if (colnum != col_to_read)
+		  // Skip this line as it does not have enough columns.
+		  continue;
+		else
+		  // The target column ends at the end of the line.
+		  coln_end = str;
+	      }
+
+	    // Skip the line if it has an empty field.
+	    if (coln_start == coln_end)
+	      continue;
+
+	    // Save the contents before and after that column for the
+	    // %< and %> escapes (ignoring the trailing and leading
+	    // commas).
+	    prefix = (col_to_read != 1) ?
+	      strndup(col1_start, coln_start - col1_start - 1) : 0;
+	    suffix = (*coln_end != 0) ? strdup(coln_end + 1) : 0;
+	    std::string field(coln_start, coln_end);
+	    // Remove double-quotes if any.
+	    if (field.find('"') != std::string::npos)
+	      {
+		unsigned dst = 0;
+		bool instring = false;
+		for (; coln_start != coln_end; ++coln_start)
+		  if (*coln_start == '"')
+		    // A doubled double-quote inside a double-quoted
+		    // string is an escaped double-quote.
+		    if (instring && coln_start[1] == '"')
+		      field[dst++] = *++coln_start;
+		    else
+		      instring = !instring;
+		  else
+		    field[dst++] = *coln_start;
+		field.resize(dst);
+	      }
+	    error |= process_string(field, filename, ++linenum);
+	    if (prefix)
+	      {
+		free(prefix);
+		prefix = 0;
+	      }
+	    if (suffix)
+	      {
+		free(suffix);
+		suffix = 0;
+	      }
+	  }
+      }
  return error;
 }

@ -128,9 +273,46 @@ job_processor::process_file(const char* filename)

  errno = 0;
  std::ifstream input(filename);
-  if (!input)
-    error(2, errno, "cannot open '%s'", filename);
+  if (input)
    return process_stream(input, filename);
+  int saved_errno = errno;
+
+  // If we have a filename like "foo/NN" such
+  // that:
+  // ① foo/NN is not a file (already the case),
+  // ② NN is a nonzero integer (negative to skip the first line),
+  // ③ foo is a file,
+  // then it means we want to open foo as
+  // a CSV file and process column NN.
+
+  if (const char* slash = strrchr(filename, '/'))
+    {
+      char* end;
+      errno = 0;
+      long int col = strtol(slash + 1, &end, 10);
+      // strtol ate all remaining characters and NN is nonzero
+      if (errno == 0 && !*end && col != 0)
+	{
+	  col_to_read = col;
+	  if (real_filename)
+	    free(real_filename);
+	  real_filename = strndup(filename, slash - filename);
+
+	  // Special case for stdin.
+	  if (real_filename[0] == '-' && real_filename[1] == 0)
+	    return process_stream(std::cin, real_filename);
+
+	  std::ifstream input(real_filename);
+	  if (input)
+	    return process_stream(input, real_filename);
+
+	  error(2, errno, "cannot open '%s' nor '%s'",
+		filename, real_filename);
+	}
+    }
+
+  error(2, saved_errno, "cannot open '%s'", filename);
+  return -1;
 }

 int
--- a/src/bin/common_finput.hh
+++ b/src/bin/common_finput.hh
@ -1,6 +1,6 @@
 // -*- coding: utf-8 -*-
-// Copyright (C) 2012 Laboratoire de Recherche et Développement de
-// l'Epita (LRDE).
+// Copyright (C) 2012, 2013 Laboratoire de Recherche et Développement
+// de l'Epita (LRDE).
 //
 // This file is part of Spot, a model checking library.
 //
@ -56,9 +56,7 @@ protected:
 public:
  job_processor();

-  virtual ~job_processor()
-  {
-  }
+  virtual ~job_processor();

  virtual int
  process_formula(const spot::ltl::formula* f,
@ -75,6 +73,11 @@ public:

  virtual int
  run();
+
+  char* real_filename;
+  long int col_to_read;
+  char* prefix;
+  char* suffix;
 };


--- a/src/bin/common_output.cc
+++ b/src/bin/common_output.cc
@ -109,6 +109,8 @@ namespace
    const spot::ltl::formula* f;
    const char* filename;
    int line;
+    const char* prefix;
+    const char* suffix;
  };

  class printable_formula:
@ -141,6 +143,8 @@ namespace
      declare('f', &fl_);
      declare('F', &filename_);
      declare('L', &line_);
+      declare('<', &prefix_);
+      declare('>', &suffix_);
      set_output(os);
    }

@ -150,6 +154,8 @@ namespace
      fl_ = &fl;
      filename_ = fl.filename ? fl.filename : "";
      line_ = fl.line;
+      prefix_ = fl.prefix ? fl.prefix : "";
+      suffix_ = fl.suffix ? fl.suffix : "";
      return format(format_);
    }

@ -158,6 +164,8 @@ namespace
    printable_formula fl_;
    spot::printable_value<const char*> filename_;
    spot::printable_value<int> line_;
+    spot::printable_value<const char*> prefix_;
+    spot::printable_value<const char*> suffix_;
  };
 }

@ -202,15 +210,20 @@ parse_opt_output(int key, char* arg, struct argp_state*)


 void
-output_formula(const spot::ltl::formula* f, const char* filename, int linenum)
+output_formula(const spot::ltl::formula* f, const char* filename, int linenum,
+	       const char* prefix, const char* suffix)
 {
  if (!format)
    {
+      if (prefix)
+	std::cout << prefix << ",";
      stream_formula(std::cout, f, filename, linenum);
+      if (suffix)
+	std::cout << "," << suffix;
    }
  else
    {
-      formula_with_location fl = { f, filename, linenum };
+      formula_with_location fl = { f, filename, linenum, prefix, suffix };
      format->print(fl);
    }

--- a/src/bin/common_output.hh
+++ b/src/bin/common_output.hh
@ -35,6 +35,7 @@ extern const struct argp output_argp;
 int parse_opt_output(int key, char* arg, struct argp_state* state);

 void output_formula(const spot::ltl::formula* f,
-		    const char* filename = 0, int linenum = 0);
+		    const char* filename = 0, int linenum = 0,
+		    const char* prefix = 0, const char* suffix = 0);

 #endif // SPOT_BIN_COMMON_OUTPUT_HH
--- a/src/bin/ltlfilt.cc
+++ b/src/bin/ltlfilt.cc
@ -165,6 +165,12 @@ static const argp_option options[] =
      "the name of the input file", 0 },
    { "%L", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
      "the original line number in the input file", 0 },
+    { "%<", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
+      "the part of the line before the formula if it "
+      "comes from a column extracted from a CSV file", 0 },
+    { "%>", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
+      "the part of the line after the formula if it "
+      "comes from a column extracted from a CSV file", 0 },
    { "%%", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
      "a single %", 0 },
    { 0, 0, 0, 0, "Miscellaneous options:", -1 },
@ -571,7 +577,7 @@ namespace
      if (matched)
 	{
 	  one_match = true;
-	  output_formula(f, filename, linenum);
+	  output_formula(f, filename, linenum, prefix, suffix);
 	}
      f->destroy();
      return 0;
@ -584,7 +590,7 @@ main(int argc, char** argv)
 {
  setup(argv);

-  const argp ap = { options, parse_opt, "[FILENAME...]",
+  const argp ap = { options, parse_opt, "[FILENAME[/COL]...]",
 		    argp_program_doc, children, 0, 0 };

  if (int err = argp_parse(&ap, argc, argv, ARGP_NO_HELP, 0, 0))
--- a/src/tgbatest/det.test
+++ b/src/tgbatest/det.test
@ -56,7 +56,7 @@ cat >formulas <<'EOF'
 1,4,G(G!a | F!c | G!b)
 EOF

-cut -d, -f3 <formulas | $ltl2tgba -x tba-det --det --stats '%d,%s,%f' -F - > out
+$ltl2tgba -x tba-det --det --stats '%d,%s,%f' -F formulas/3 > out
 diff formulas out

 cat >in.tgba <<'EOF'
--- a/src/tgbatest/nondet.test
+++ b/src/tgbatest/nondet.test
@ -28,8 +28,8 @@ a U b, 1 0
 G(!r | Fa) | Fx, 0 1
 EOF

-cut -d, -f1 expected.1 |
-../../bin/ltl2tgba -F- --stats='%f, %d %p' >out.1
+# also test the filename/COL syntax
+../../bin/ltl2tgba -F expected.1/1 --stats='%f, %d %p' >out.1
 diff out.1 expected.1

 cat >expected.2<<EOF
@ -39,8 +39,8 @@ a U b, 1 1
 G(!r | Fa) | Fx, 0 1
 EOF

-cut -d, -f1 expected.2 |
-../../bin/ltl2tgba -C -F- --stats='%f, %d %p' >out.2
+# filename/COL should also work when filename=-
+../../bin/ltl2tgba -C -F-/1 --stats='%f, %d %p' <expected.2 >out.2
 diff out.2 expected.2