bin: add support for reading formulas from CSV files.

* NEWS: Mention it.
* src/bin/common_finput.cc, src/bin/common_finput.hh: Implement it.
* src/bin/common_output.cc, src/bin/common_output.hh: Add the %< and
%> escapes.
* src/bin/ltlfilt.cc: Connect %< and %> to the
prefix andsuffix of the input, and document them.
* src/tgbatest/det.test, src/tgbatest/nondet.test: Simplify these
tests that read CSV files.
This commit is contained in:
Alexandre Duret-Lutz 2013-11-29 09:32:51 +01:00
parent 8c5875314c
commit 0faea814da
8 changed files with 236 additions and 22 deletions

9
NEWS
View file

@ -22,6 +22,15 @@ New in spot 1.2a (not released)
output in CSV or JSON will have some extra columns to report output in CSV or JSON will have some extra columns to report
the size of these input automata before ltlcross converts them the size of these input automata before ltlcross converts them
into TGBA to perform its regular checks. into TGBA to perform its regular checks.
- ltlfilt, ltl2tgba, ltl2tgta, and ltlcross can now read formulas
from CSV files. Use option -F FILE/COL to read formulas from
column COL of FILE. Use -F FILE/-COL if the first line of
FILE be ignored.
- when ltlfilt processes formulas from a CSV file, it will output
each CSV line whose formula matches the given constraints, with
the rewriten formula. The new escape sequence %< (text in
columns before the formula) and %> (text after) can be used
with the --format option to alter this output.
* Bug fixes: * Bug fixes:
- ltlcross' CSV output now stricly follows RFC 4180. - ltlcross' CSV output now stricly follows RFC 4180.
- ltlcross failed to report missing input or output escape sequences - ltlcross failed to report missing input or output escape sequences

View file

@ -22,6 +22,7 @@
#include "ltlparse/public.hh" #include "ltlparse/public.hh"
#include <fstream> #include <fstream>
#include <cstring>
#define OPT_LBT 1 #define OPT_LBT 1
#define OPT_LENIENT 2 #define OPT_LENIENT 2
@ -34,8 +35,10 @@ static const argp_option options[] =
{ {
{ 0, 0, 0, 0, "Input options:", 1 }, { 0, 0, 0, 0, "Input options:", 1 },
{ "formula", 'f', "STRING", 0, "process the formula STRING", 0 }, { "formula", 'f', "STRING", 0, "process the formula STRING", 0 },
{ "file", 'F', "FILENAME", 0, { "file", 'F', "FILENAME[/COL]", 0,
"process each line of FILENAME as a formula", 0 }, "process each line of FILENAME as a formula; if COL is a "
"positive integer, assume a CSV file and read column COL; use "
"a negative COL to drop the first line of the CSV file", 0 },
{ "lbt-input", OPT_LBT, 0, 0, { "lbt-input", OPT_LBT, 0, 0,
"read all formulas using LBT's prefix syntax", 0 }, "read all formulas using LBT's prefix syntax", 0 },
{ "lenient", OPT_LENIENT, 0, 0, { "lenient", OPT_LENIENT, 0, 0,
@ -82,10 +85,22 @@ parse_formula(const std::string& s, spot::ltl::parse_error_list& pel)
} }
job_processor::job_processor() job_processor::job_processor()
: abort_run(false) : abort_run(false), real_filename(0),
col_to_read(0), prefix(0), suffix(0)
{ {
} }
job_processor::~job_processor()
{
if (real_filename)
free(real_filename);
if (prefix)
free(prefix);
if (suffix)
free(suffix);
}
int int
job_processor::process_string(const std::string& input, job_processor::process_string(const std::string& input,
const char* filename, const char* filename,
@ -113,9 +128,139 @@ job_processor::process_stream(std::istream& is,
int error = 0; int error = 0;
int linenum = 0; int linenum = 0;
std::string line; std::string line;
// Discard the first line of a CSV file if requested.
if (col_to_read < 0)
{
std::getline(is, line);
col_to_read = -col_to_read;
}
// Each line of the file and send them to process_string,
// optionally extracting a column of a CSV file.
while (!abort_run && std::getline(is, line)) while (!abort_run && std::getline(is, line))
if (!line.empty()) if (!line.empty())
error |= process_string(line, filename, ++linenum); {
if (col_to_read == 0)
{
error |= process_string(line, filename, ++linenum);
}
else // We are reading column COL_TO_READ in a CSV file.
{
// FIXME: This code assumes an entire CSV row was been
// fetched by getline(). This is incorrect for processing
// CSV files with fields that contain newlines inside
// double-quoted strings. Patching this code to deal with
// such files is left as an exercise for the first user
// who encounters the issue.
const char* str = line.c_str();
const char* col1_start = str;
// Delimiters for the extracted column.
const char* coln_start = str;
const char* coln_end = 0;
// The current column. (1-based)
int colnum = 1;
// Whether we are parsing a double-quoted string.
bool instring = false;
// Note that RFC 4180 has strict rules about
// double-quotes: ① if a field is double-quoted, the first
// and last characters of the field should be
// double-quotes; ② if a field contains a double-quote
// then it should be double quoted, and the occurrences
// of double-quotes should be doubled. Therefore a CSV file
// may no contain a line such as:
// foo,bar"ba""z",12
// Tools have different interpretation of such a line.
// For instance Python's pandas.read_csv() function will
// load the second field verbatim as the string 'bar"ba""z"',
// while R's read.csv() function will load it as the
// string 'barba"z'. We use this second interpretation, because
// it also makes it possible to parse CSVs fields formatted
// with leading spaces (often for cosmetic purpose). When
// extracting the second field of
// foo, "ba""z", 12
// we will return ' baz' and the leading space will be ignored
// by our LTL formula parser.
while (*str)
{
switch (*str)
{
case '"':
// Doubled double-quotes are used to escape
// double-quotes.
if (instring && str[1] == '"')
++str;
else
instring = !instring;
break;
case ',':
if (!instring)
{
if (col_to_read == colnum)
coln_end = str;
++colnum;
if (col_to_read == colnum)
coln_start = str + 1;
}
break;
}
// Once we have the end delimiter for our target
// column, we have all we need.
if (coln_end)
break;
++str;
}
if (!*str)
{
if (colnum != col_to_read)
// Skip this line as it has no enough columns.
continue;
else
// The target columns ends at the end of the line.
coln_end = str;
}
// Skip the line if it has an empty field.
if (coln_start == coln_end)
continue;
// save the contents before and after that columns for the
// %< and %> escapes (ignoring the trailing and leading
// commas).
prefix = (col_to_read != 1) ?
strndup(col1_start, coln_start - col1_start - 1) : 0;
suffix = (*coln_end != 0) ? strdup(coln_end + 1) : 0;
std::string field(coln_start, coln_end);
// Remove double-quotes if any.
if (field.find('"') != std::string::npos)
{
unsigned dst = 0;
bool instring = false;
for (; coln_start != coln_end; ++coln_start)
if (*coln_start == '"')
// A doubled double-quote instead a double-quoted
// string is an escaped double-quote.
if (instring && coln_start[1] == '"')
field[dst++] = *++coln_start;
else
instring = !instring;
else
field[dst++] = *coln_start;
field.resize(dst);
}
error |= process_string(field, filename, ++linenum);
if (prefix)
{
free(prefix);
prefix = 0;
}
if (suffix)
{
free(suffix);
suffix = 0;
}
}
}
return error; return error;
} }
@ -128,9 +273,46 @@ job_processor::process_file(const char* filename)
errno = 0; errno = 0;
std::ifstream input(filename); std::ifstream input(filename);
if (!input) if (input)
error(2, errno, "cannot open '%s'", filename); return process_stream(input, filename);
return process_stream(input, filename); int saved_errno = errno;
// If we have a filename like "foo/NN" such
// that:
// ① foo/NN is not a file (already the case),
// ② NN is a number > 0,
// ③ foo is a file,
// then it means we want to open foo as
// a CSV file and process column NN.
if (const char* slash = strrchr(filename, '/'))
{
char* end;
errno = 0;
long int col = strtol(slash + 1, &end, 10);
// strtol ate all remaining characters and NN is positive
if (errno == 0 && !*end && col != 0)
{
col_to_read = col;
if (real_filename)
free(real_filename);
real_filename = strndup(filename, slash - filename);
// Special case for stdin.
if (real_filename[0] == '-' && real_filename[1] == 0)
return process_stream(std::cin, real_filename);
std::ifstream input(real_filename);
if (input)
return process_stream(input, real_filename);
error(2, errno, "cannot open '%s' nor '%s'",
filename, real_filename);
}
}
error(2, saved_errno, "cannot open '%s'", filename);
return -1;
} }
int int

View file

@ -1,6 +1,6 @@
// -*- coding: utf-8 -*- // -*- coding: utf-8 -*-
// Copyright (C) 2012 Laboratoire de Recherche et Développement de // Copyright (C) 2012, 2013 Laboratoire de Recherche et Développement
// l'Epita (LRDE). // de l'Epita (LRDE).
// //
// This file is part of Spot, a model checking library. // This file is part of Spot, a model checking library.
// //
@ -56,9 +56,7 @@ protected:
public: public:
job_processor(); job_processor();
virtual ~job_processor() virtual ~job_processor();
{
}
virtual int virtual int
process_formula(const spot::ltl::formula* f, process_formula(const spot::ltl::formula* f,
@ -75,6 +73,11 @@ public:
virtual int virtual int
run(); run();
char* real_filename;
long int col_to_read;
char* prefix;
char* suffix;
}; };

View file

@ -109,6 +109,8 @@ namespace
const spot::ltl::formula* f; const spot::ltl::formula* f;
const char* filename; const char* filename;
int line; int line;
const char* prefix;
const char* suffix;
}; };
class printable_formula: class printable_formula:
@ -141,6 +143,8 @@ namespace
declare('f', &fl_); declare('f', &fl_);
declare('F', &filename_); declare('F', &filename_);
declare('L', &line_); declare('L', &line_);
declare('<', &prefix_);
declare('>', &suffix_);
set_output(os); set_output(os);
} }
@ -150,6 +154,8 @@ namespace
fl_ = &fl; fl_ = &fl;
filename_ = fl.filename ? fl.filename : ""; filename_ = fl.filename ? fl.filename : "";
line_ = fl.line; line_ = fl.line;
prefix_ = fl.prefix ? fl.prefix : "";
suffix_ = fl.suffix ? fl.suffix : "";
return format(format_); return format(format_);
} }
@ -158,6 +164,8 @@ namespace
printable_formula fl_; printable_formula fl_;
spot::printable_value<const char*> filename_; spot::printable_value<const char*> filename_;
spot::printable_value<int> line_; spot::printable_value<int> line_;
spot::printable_value<const char*> prefix_;
spot::printable_value<const char*> suffix_;
}; };
} }
@ -202,15 +210,20 @@ parse_opt_output(int key, char* arg, struct argp_state*)
void void
output_formula(const spot::ltl::formula* f, const char* filename, int linenum) output_formula(const spot::ltl::formula* f, const char* filename, int linenum,
const char* prefix, const char* suffix)
{ {
if (!format) if (!format)
{ {
if (prefix)
std::cout << prefix << ",";
stream_formula(std::cout, f, filename, linenum); stream_formula(std::cout, f, filename, linenum);
if (suffix)
std::cout << "," << suffix;
} }
else else
{ {
formula_with_location fl = { f, filename, linenum }; formula_with_location fl = { f, filename, linenum, prefix, suffix };
format->print(fl); format->print(fl);
} }

View file

@ -35,6 +35,7 @@ extern const struct argp output_argp;
int parse_opt_output(int key, char* arg, struct argp_state* state); int parse_opt_output(int key, char* arg, struct argp_state* state);
void output_formula(const spot::ltl::formula* f, void output_formula(const spot::ltl::formula* f,
const char* filename = 0, int linenum = 0); const char* filename = 0, int linenum = 0,
const char* prefix = 0, const char* suffix = 0);
#endif // SPOT_BIN_COMMON_OUTPUT_HH #endif // SPOT_BIN_COMMON_OUTPUT_HH

View file

@ -165,6 +165,12 @@ static const argp_option options[] =
"the name of the input file", 0 }, "the name of the input file", 0 },
{ "%L", 0, 0, OPTION_DOC | OPTION_NO_USAGE, { "%L", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
"the original line number in the input file", 0 }, "the original line number in the input file", 0 },
{ "%<", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
"the part of the line before the formula if it "
"comes from a column extracted from a CSV file", 0 },
{ "%>", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
"the part of the line after the formula if it "
"comes from a column extracted from a CSV file", 0 },
{ "%%", 0, 0, OPTION_DOC | OPTION_NO_USAGE, { "%%", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
"a single %", 0 }, "a single %", 0 },
{ 0, 0, 0, 0, "Miscellaneous options:", -1 }, { 0, 0, 0, 0, "Miscellaneous options:", -1 },
@ -571,7 +577,7 @@ namespace
if (matched) if (matched)
{ {
one_match = true; one_match = true;
output_formula(f, filename, linenum); output_formula(f, filename, linenum, prefix, suffix);
} }
f->destroy(); f->destroy();
return 0; return 0;
@ -584,7 +590,7 @@ main(int argc, char** argv)
{ {
setup(argv); setup(argv);
const argp ap = { options, parse_opt, "[FILENAME...]", const argp ap = { options, parse_opt, "[FILENAME[/COL]...]",
argp_program_doc, children, 0, 0 }; argp_program_doc, children, 0, 0 };
if (int err = argp_parse(&ap, argc, argv, ARGP_NO_HELP, 0, 0)) if (int err = argp_parse(&ap, argc, argv, ARGP_NO_HELP, 0, 0))

View file

@ -56,7 +56,7 @@ cat >formulas <<'EOF'
1,4,G(G!a | F!c | G!b) 1,4,G(G!a | F!c | G!b)
EOF EOF
cut -d, -f3 <formulas | $ltl2tgba -x tba-det --det --stats '%d,%s,%f' -F - > out $ltl2tgba -x tba-det --det --stats '%d,%s,%f' -F formulas/3 > out
diff formulas out diff formulas out
cat >in.tgba <<'EOF' cat >in.tgba <<'EOF'

View file

@ -28,8 +28,8 @@ a U b, 1 0
G(!r | Fa) | Fx, 0 1 G(!r | Fa) | Fx, 0 1
EOF EOF
cut -d, -f1 expected.1 | # also test the filename/COL syntax
../../bin/ltl2tgba -F- --stats='%f, %d %p' >out.1 ../../bin/ltl2tgba -F expected.1/1 --stats='%f, %d %p' >out.1
diff out.1 expected.1 diff out.1 expected.1
cat >expected.2<<EOF cat >expected.2<<EOF
@ -39,8 +39,8 @@ a U b, 1 1
G(!r | Fa) | Fx, 0 1 G(!r | Fa) | Fx, 0 1
EOF EOF
cut -d, -f1 expected.2 | # filename/COL should also work when filename=-
../../bin/ltl2tgba -C -F- --stats='%f, %d %p' >out.2 ../../bin/ltl2tgba -C -F-/1 --stats='%f, %d %p' <expected.2 >out.2
diff out.2 expected.2 diff out.2 expected.2