summaryrefslogtreecommitdiff
path: root/libcutl/cutl/xml/parser.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'libcutl/cutl/xml/parser.cxx')
-rw-r--r--libcutl/cutl/xml/parser.cxx827
1 files changed, 827 insertions, 0 deletions
diff --git a/libcutl/cutl/xml/parser.cxx b/libcutl/cutl/xml/parser.cxx
new file mode 100644
index 0000000..219fb00
--- /dev/null
+++ b/libcutl/cutl/xml/parser.cxx
@@ -0,0 +1,827 @@
+// file : cutl/xml/parser.cxx
+// copyright : Copyright (c) 2009-2013 Code Synthesis Tools CC
+// license : MIT; see accompanying LICENSE file
+
+#include <new> // std::bad_alloc
+#include <cassert>
+#include <cstring> // std::strchr
+#include <istream>
+#include <ostream>
+#include <sstream>
+
+#include <cutl/xml/parser.hxx>
+
+using namespace std;
+
+namespace cutl
+{
+ namespace xml
+ {
+ // parsing
+ //
+ // Destructor with an empty body and an empty exception specification.
+ //
+ parsing::
+ ~parsing () throw ()
+ {
+ }
+
+ // Construct from an explicit input name (n), line (l), column (c), and
+ // error description (d), then let init() compose the stored message.
+ //
+ parsing::
+ parsing (const string& n,
+          unsigned long long l,
+          unsigned long long c,
+          const string& d)
+   : name_ (n),
+     line_ (l),
+     column_ (c),
+     description_ (d)
+ {
+   init ();
+ }
+
+ // Construct from a parser, taking the input name and the line/column
+ // position it currently reports, plus the error description (d).
+ //
+ parsing::
+ parsing (const parser& p, const std::string& d)
+   : name_ (p.input_name ()), line_ (p.line ()), column_ (p.column ()),
+     description_ (d)
+ {
+   init ();
+ }
+
+ // Compose the message returned by what() in the form
+ //
+ //   [<name>:]<line>:<column>: error: <description>
+ //
+ // where the name part is omitted if the input name is empty.
+ //
+ void parsing::
+ init ()
+ {
+   ostringstream text;
+
+   if (name_.size () != 0)
+     text << name_ << ':';
+
+   text << line_ << ':' << column_;
+   text << ": error: " << description_;
+
+   what_ = text.str ();
+ }
+
+ // Return the message composed by init(). The pointer refers to the
+ // what_ member.
+ //
+ const char* parsing::
+ what () const throw ()
+ {
+   return what_.c_str ();
+ }
+
+ // parser::event_type
+ //
+ // Human-readable event names. The table is indexed directly with
+ // parser::event_type values (see operator<< and next_expect()).
+ //
+ static const char* parser_event_str[] =
+ {
+   "start element",               // 0
+   "end element",                 // 1
+   "start attribute",             // 2
+   "end attribute",               // 3
+   "characters",                  // 4
+   "start namespace declaration", // 5
+   "end namespace declaration",   // 6
+   "end of file"                  // 7
+ };
+
+ // Print the human-readable name of the event.
+ //
+ ostream&
+ operator<< (ostream& os, parser::event_type e)
+ {
+   const char* text (parser_event_str[e]);
+   os << text;
+   return os;
+ }
+
+ // parser
+ //
+ // Free the underlying Expat parser, if one was allocated.
+ //
+ parser::
+ ~parser ()
+ {
+   if (p_ == 0)
+     return;
+
+   XML_ParserFree (p_);
+ }
+
+ // Create a parser reading from the stream (is). The input name (iname)
+ // is used when reporting errors and the feature mask (f) selects which
+ // Expat handlers get installed.
+ //
+ // Throws bad_alloc if the Expat parser cannot be allocated.
+ //
+ parser::
+ parser (istream& is, const string& iname, feature_type f)
+   : is_ (is),
+     iname_ (iname),
+     feature_ (f),
+     depth_ (0),
+     state_ (state_next),
+     event_ (eof),
+     queue_ (eof),
+     pqname_ (&qname_),
+     pvalue_ (&value_),
+     attr_i_ (0),
+     start_ns_i_ (0),
+     end_ns_i_ (0)
+ {
+   // If both attribute features were requested, drop the map one so
+   // that only attribute events remain enabled.
+   //
+   if ((feature_ & receive_attributes_event) != 0 &&
+       (feature_ & receive_attributes_map) != 0)
+     feature_ &= ~receive_attributes_map;
+
+   const bool want_elements ((f & receive_elements) != 0);
+   const bool want_characters ((f & receive_characters) != 0);
+   const bool want_namespace_decls ((f & receive_namespace_decls) != 0);
+
+   // Allocate the parser. Make sure nothing else can throw after
+   // this call since otherwise we will leak it.
+   //
+   p_ = XML_ParserCreateNS (0, XML_Char (' '));
+
+   if (p_ == 0)
+     throw bad_alloc ();
+
+   // Get prefixes in addition to namespaces and local names.
+   //
+   XML_SetReturnNSTriplet (p_, true);
+
+   // Pass this object to the handlers and install the requested ones.
+   //
+   XML_SetUserData(p_, this);
+
+   if (want_elements)
+   {
+     XML_SetStartElementHandler (p_, &start_element_);
+     XML_SetEndElementHandler (p_, &end_element_);
+   }
+
+   if (want_characters)
+     XML_SetCharacterDataHandler (p_, &characters_);
+
+   if (want_namespace_decls)
+     XML_SetNamespaceDeclHandler (p_,
+                                  &start_namespace_decl_,
+                                  &end_namespace_decl_);
+ }
+
+ void parser::
+ handle_error ()
+ {
+ XML_Error e (XML_GetErrorCode (p_));
+
+ if (e == XML_ERROR_ABORTED)
+ {
+ // For now we only abort the parser in the characters_() handler.
+ //
+ switch (content ())
+ {
+ case empty:
+ throw parsing (*this, "character in empty content");
+ case complex:
+ throw parsing (*this, "character in complex content");
+ default:
+ assert (false);
+ }
+ }
+ else
+ throw parsing (iname_,
+ XML_GetCurrentLineNumber (p_),
+ XML_GetCurrentColumnNumber (p_),
+ XML_ErrorString (e));
+ }
+
+ // Scope guard that removes failbit from the stream's exception mask on
+ // construction and, on destruction, restores the original mask and
+ // clears a failbit that was set together with eofbit.
+ //
+ struct stream_exception_controller
+ {
+   stream_exception_controller (istream& is)
+     : is_ (is), old_state_ (is_.exceptions ())
+   {
+     is_.exceptions (old_state_ & ~istream::failbit);
+   }
+
+   ~stream_exception_controller ()
+   {
+     // Current error state with failbit masked out.
+     //
+     const istream::iostate state (is_.rdstate () & ~istream::failbit);
+
+     // If our error state (sans failbit) intersects with the
+     // exception state then that means we have an active
+     // exception and changing error/exception state will
+     // cause another to be thrown.
+     //
+     if (old_state_ & state)
+       return;
+
+     // Clear failbit if it was caused by eof.
+     //
+     if (is_.fail () && is_.eof ())
+       is_.clear (state);
+
+     is_.exceptions (old_state_);
+   }
+
+ private:
+   // Non-copyable (declared but not defined).
+   //
+   stream_exception_controller (const stream_exception_controller&);
+
+   stream_exception_controller&
+   operator= (const stream_exception_controller&);
+
+ private:
+   istream& is_;
+   istream::iostate old_state_;
+ };
+
+ // Return the value of the attribute (qn) of the current element and
+ // mark it as handled. Throws parsing if there is no such attribute.
+ //
+ const string& parser::
+ attribute (const qname_type& qn) const
+ {
+   const element_entry* entry (get_element ());
+
+   attribute_map_type::const_iterator pos;
+   bool found (false);
+
+   if (entry != 0)
+   {
+     pos = entry->attr_map_.find (qn);
+     found = (pos != entry->attr_map_.end ());
+   }
+
+   if (!found)
+     throw parsing (*this, "attribute '" + qn.string () + "' expected");
+
+   // First access: one fewer unhandled attribute for this element.
+   //
+   if (!pos->second.handled)
+   {
+     pos->second.handled = true;
+     entry->attr_unhandled_--;
+   }
+
+   return pos->second.value;
+ }
+
+ // Return the value of the attribute (qn) of the current element and
+ // mark it as handled. If there is no such attribute, return the
+ // default value (dv) instead.
+ //
+ string parser::
+ attribute (const qname_type& qn, const string& dv) const
+ {
+   const element_entry* entry (get_element ());
+
+   if (entry == 0)
+     return dv;
+
+   attribute_map_type::const_iterator pos (entry->attr_map_.find (qn));
+
+   if (pos == entry->attr_map_.end ())
+     return dv;
+
+   // First access: one fewer unhandled attribute for this element.
+   //
+   if (!pos->second.handled)
+   {
+     pos->second.handled = true;
+     entry->attr_unhandled_--;
+   }
+
+   return pos->second.value;
+ }
+
+ // Return true if the current element has the attribute (qn), marking
+ // it as handled in that case.
+ //
+ bool parser::
+ attribute_present (const qname_type& qn) const
+ {
+   const element_entry* entry (get_element ());
+
+   if (entry == 0)
+     return false;
+
+   attribute_map_type::const_iterator pos (entry->attr_map_.find (qn));
+
+   if (pos == entry->attr_map_.end ())
+     return false;
+
+   // First access: one fewer unhandled attribute for this element.
+   //
+   if (!pos->second.handled)
+   {
+     pos->second.handled = true;
+     entry->attr_unhandled_--;
+   }
+
+   return true;
+ }
+
+ // Advance to the next event and throw parsing unless it is (e).
+ //
+ void parser::
+ next_expect (event_type e)
+ {
+   if (next () == e)
+     return;
+
+   throw parsing (*this, string (parser_event_str[e]) + " expected");
+ }
+
+ // Advance to the next event and throw parsing unless it is (e) with
+ // the namespace (ns) and name (n).
+ //
+ void parser::
+ next_expect (event_type e, const string& ns, const string& n)
+ {
+   // The order matters: next() must run before the name is inspected.
+   //
+   const bool matched (next () == e && namespace_ () == ns && name () == n);
+
+   if (matched)
+     return;
+
+   throw parsing (*this,
+                  string (parser_event_str[e]) + " '" +
+                  qname_type (ns, n).string () + "' expected");
+ }
+
+ // Return the element stack entry whose depth equals the current depth
+ // or 0 if there is none. Only the top entry and the one right below
+ // it are considered.
+ //
+ const parser::element_entry* parser::
+ get_element () const
+ {
+   const element_state::size_type count (element_state_.size ());
+
+   if (count == 0)
+     return 0;
+
+   const element_entry& top (element_state_[count - 1]);
+
+   if (top.depth == depth_)
+     return &top;
+
+   // The start_element_() Expat handler may have already provisioned
+   // an entry in the element stack. In this case, we need to get the
+   // one before it, if any.
+   //
+   if (count > 1 && top.depth > depth_)
+   {
+     const element_entry& below (element_state_[count - 2]);
+
+     if (below.depth == depth_)
+       return &below;
+   }
+
+   return 0;
+ }
+
+ // Pop the top element stack entry, throwing parsing if any of its
+ // attributes were left unhandled.
+ //
+ void parser::
+ pop_element ()
+ {
+   const element_entry& top (element_state_.back ());
+
+   if (top.attr_unhandled_ != 0)
+   {
+     // Locate the first unhandled attribute and report it.
+     //
+     attribute_map_type::const_iterator i (top.attr_map_.begin ());
+
+     while (i != top.attr_map_.end () && i->second.handled)
+       ++i;
+
+     if (i != top.attr_map_.end ())
+       throw parsing (
+         *this, "unexpected attribute '" + i->first.string () + "'");
+
+     // The counter says there is one, so we should have found it.
+     //
+     assert (false);
+   }
+
+   element_state_.pop_back ();
+ }
+
+ // Get the next event from next_body() and apply element-level
+ // bookkeeping: content checks for start elements and depth/stack
+ // adjustments, which are skipped if this is a peek.
+ //
+ parser::event_type parser::
+ next_ (bool peek)
+ {
+   const event_type ev (next_body ());
+
+   // Content-specific processing. Note that we handle characters in the
+   // characters_() Expat handler for two reasons. Firstly, it is faster
+   // to ignore the whitespaces at the source. Secondly, this allows us
+   // to distinguish between element and attribute characters. We can
+   // move this processing to the handler because the characters event
+   // is never queued.
+   //
+   if (ev == end_element)
+   {
+     // If this is a peek, then avoid popping the stack just yet.
+     // This way, the attribute map will still be valid until we
+     // call next().
+     //
+     if (!peek)
+     {
+       const bool have_entry (!element_state_.empty () &&
+                              element_state_.back ().depth == depth_);
+       if (have_entry)
+         pop_element ();
+
+       depth_--;
+     }
+   }
+   else if (ev == start_element)
+   {
+     // Without an entry for the enclosing element its content is
+     // treated as mixed.
+     //
+     const element_entry* parent (get_element ());
+
+     switch (parent != 0 ? parent->content : mixed)
+     {
+     case empty:
+       throw parsing (*this, "element in empty content");
+     case simple:
+       throw parsing (*this, "element in simple content");
+     default:
+       break;
+     }
+
+     // If this is a peek, then delay adjusting the depth.
+     //
+     if (!peek)
+       depth_++;
+   }
+
+   return ev;
+ }
+
+ // Produce the next event. Events recorded by the Expat handlers are
+ // returned first, in this order: start namespace declarations,
+ // attribute events, end namespace declarations, and then the single
+ // queued event. Only when none are pending is the suspended parser
+ // resumed or the next chunk of input read from the stream and parsed.
+ //
+ // Throws parsing for XML errors (via handle_error()) and for stream
+ // read failures, and bad_alloc if Expat cannot provide a buffer.
+ //
+ parser::event_type parser::
+ next_body ()
+ {
+   // See if we have any start namespace declarations we need to return.
+   //
+   if (start_ns_i_ < start_ns_.size ())
+   {
+     // Based on the previous event determine what's the next one must be.
+     //
+     switch (event_)
+     {
+     case start_namespace_decl:
+       {
+         if (++start_ns_i_ == start_ns_.size ())
+         {
+           start_ns_i_ = 0;
+           start_ns_.clear ();
+           pqname_ = &qname_;
+           break; // No more declarations.
+         }
+         // Fall through.
+       }
+     case start_element:
+       {
+         event_ = start_namespace_decl;
+         pqname_ = &start_ns_[start_ns_i_];
+         return event_;
+       }
+     default:
+       {
+         assert (false);
+         return event_ = eof;
+       }
+     }
+   }
+
+   // See if we have any attributes we need to return as events. Each
+   // attribute is reported as start_attribute, characters, end_attribute.
+   //
+   if (attr_i_ < attr_.size ())
+   {
+     // Based on the previous event determine what's the next one must be.
+     //
+     switch (event_)
+     {
+     case start_attribute:
+       {
+         event_ = characters;
+         pvalue_ = &attr_[attr_i_].value;
+         return event_;
+       }
+     case characters:
+       {
+         event_ = end_attribute; // Name is already set.
+         return event_;
+       }
+     case end_attribute:
+       {
+         if (++attr_i_ == attr_.size ())
+         {
+           attr_i_ = 0;
+           attr_.clear ();
+           pqname_ = &qname_;
+           pvalue_ = &value_;
+           break; // No more attributes.
+         }
+         // Fall through.
+       }
+     case start_element:
+     case start_namespace_decl:
+       {
+         event_ = start_attribute;
+         pqname_ = &attr_[attr_i_].qname;
+         return event_;
+       }
+     default:
+       {
+         assert (false);
+         return event_ = eof;
+       }
+     }
+   }
+
+   // See if we have any end namespace declarations we need to return.
+   //
+   if (end_ns_i_ < end_ns_.size ())
+   {
+     // Based on the previous event determine what's the next one must be.
+     //
+     switch (event_)
+     {
+     case end_namespace_decl:
+       {
+         if (++end_ns_i_ == end_ns_.size ())
+         {
+           end_ns_i_ = 0;
+           end_ns_.clear ();
+           pqname_ = &qname_;
+           break; // No more declarations.
+         }
+         // Fall through.
+       }
+     // The end namespace declaration comes before the end element
+     // which means it can follow pretty much any other event.
+     //
+     default:
+       {
+         event_ = end_namespace_decl;
+         pqname_ = &end_ns_[end_ns_i_];
+         return event_;
+       }
+     }
+   }
+
+   // Check the queue.
+   //
+   if (queue_ != eof)
+   {
+     event_ = queue_;
+     queue_ = eof;
+     return event_;
+   }
+
+   XML_ParsingStatus ps;
+   XML_GetParsingStatus (p_, &ps);
+
+   switch (ps.parsing)
+   {
+   case XML_INITIALIZED:
+     {
+       // As if we finished the previous chunk.
+       break;
+     }
+   case XML_PARSING:
+     {
+       assert (false);
+       return event_ = eof;
+     }
+   case XML_FINISHED:
+     {
+       return event_ = eof;
+     }
+   case XML_SUSPENDED:
+     {
+       switch (XML_ResumeParser (p_))
+       {
+       case XML_STATUS_SUSPENDED:
+         {
+           // If the parser is again in the suspended state, then
+           // that means we have the next event.
+           //
+           return event_;
+         }
+       case XML_STATUS_OK:
+         {
+           // Otherwise, we need to get and parse the next chunk of data
+           // unless this was the last chunk, in which case this is eof.
+           //
+           if (ps.finalBuffer)
+             return event_ = eof;
+
+           break;
+         }
+       case XML_STATUS_ERROR:
+         handle_error ();
+       }
+
+       break;
+     }
+   }
+
+   // Get and parse the next chunk of data until we get the next event
+   // or reach eof.
+   //
+   event_ = eof;
+   XML_Status s;
+   do
+   {
+     const size_t cap (4096);
+
+     char* b (static_cast<char*> (XML_GetBuffer (p_, cap)));
+     if (b == 0)
+       throw bad_alloc ();
+
+     // Temporarily unset the exception failbit. Also clear the fail bit
+     // when we reset the old state if it was caused by eof.
+     //
+     {
+       stream_exception_controller sec (is_);
+       is_.read (b, static_cast<streamsize> (cap));
+     }
+
+     // A read that failed for a reason other than reaching the end of
+     // the input delivers no data and never sets eofbit, so without
+     // this check the loop below would not terminate. If the caller
+     // hasn't configured the stream to throw, report the failure with
+     // the parsing exception.
+     //
+     if (is_.bad () || (is_.fail () && !is_.eof ()))
+       throw parsing (*this, "io failure");
+
+     // The chunk is the final one if the read reached eof.
+     //
+     s = XML_ParseBuffer (p_, static_cast<int> (is_.gcount ()), is_.eof ());
+
+     if (s == XML_STATUS_ERROR)
+       handle_error ();
+
+   } while (s != XML_STATUS_SUSPENDED && !is_.eof ());
+
+   return event_;
+ }
+
+ // Split a name as passed by Expat into the namespace, local name, and
+ // prefix parts of (qn). The parts are separated by single spaces:
+ //
+ //   <name>
+ //   <namespace> <name>
+ //   <namespace> <name> <prefix>
+ //
+ // Parts that are not present are cleared.
+ //
+ static void
+ split_name (const XML_Char* s, qname& qn)
+ {
+   string& ns (qn.namespace_ ());
+   string& name (qn.name ());
+   string& prefix (qn.prefix ());
+
+   const char* p (strchr (s, ' '));
+
+   if (p == 0)
+   {
+     ns.clear ();
+     name = s;
+     prefix.clear ();
+   }
+   else
+   {
+     // Assign the [s, p) range directly. Passing (s, 0, p - s) would
+     // first build a temporary string from the whole remainder.
+     //
+     ns.assign (s, p);
+
+     s = p + 1;
+     p = strchr (s, ' ');
+
+     if (p == 0)
+     {
+       name = s;
+       prefix.clear ();
+     }
+     else
+     {
+       name.assign (s, p);
+       prefix = p + 1;
+     }
+   }
+ }
+
+ // Expat start element handler. Records the element as the current
+ // event, stores its attributes either in the map of a newly
+ // provisioned element stack entry or in the attribute event list
+ // (depending on the enabled features), and suspends the parser.
+ //
+ void XMLCALL parser::
+ start_element_ (void* v, const XML_Char* name, const XML_Char** atts)
+ {
+   parser& self (*static_cast<parser*> (v));
+
+   XML_ParsingStatus status;
+   XML_GetParsingStatus (self.p_, &status);
+
+   // Expat has a (mis)-feature of possibly calling handlers even
+   // after the non-resumable XML_StopParser call.
+   //
+   if (status.parsing == XML_FINISHED)
+     return;
+
+   // Cannot be a followup event.
+   //
+   assert (status.parsing == XML_PARSING);
+
+   self.event_ = start_element;
+   split_name (name, self.qname_);
+
+   self.line_ = XML_GetCurrentLineNumber (self.p_);
+   self.column_ = XML_GetCurrentColumnNumber (self.p_);
+
+   // Handle attributes. They come as name/value pairs terminated by a
+   // null name.
+   //
+   if (*atts != 0)
+   {
+     const bool as_map ((self.feature_ & receive_attributes_map) != 0);
+     const bool as_events ((self.feature_ & receive_attributes_event) != 0);
+
+     if (as_map)
+     {
+       // Provision an entry for this element.
+       //
+       self.element_state_.push_back (element_entry (self.depth_ + 1));
+       element_entry* entry (&self.element_state_.back ());
+
+       for (; *atts != 0; atts += 2)
+       {
+         qname_type qn;
+         split_name (*atts, qn);
+         attribute_map_type::value_type item (qn, attribute_value_type ());
+         item.second.value = *(atts + 1);
+         item.second.handled = false;
+         entry->attr_map_.insert (item);
+       }
+
+       // Initially every attribute in the map is unhandled.
+       //
+       entry->attr_unhandled_ = entry->attr_map_.size ();
+     }
+     else if (as_events)
+     {
+       for (; *atts != 0; atts += 2)
+       {
+         self.attr_.push_back (attribute_type ());
+         split_name (*atts, self.attr_.back ().qname);
+         self.attr_.back ().value = *(atts + 1);
+       }
+     }
+   }
+
+   XML_StopParser (self.p_, true);
+ }
+
+ // Expat end element handler. Either queues the end element event (if
+ // called while the parser is not in the parsing state) or makes it, or
+ // a pending end namespace declaration, the current event and suspends
+ // the parser.
+ //
+ void XMLCALL parser::
+ end_element_ (void* v, const XML_Char* name)
+ {
+   parser& self (*static_cast<parser*> (v));
+
+   XML_ParsingStatus status;
+   XML_GetParsingStatus (self.p_, &status);
+
+   // Expat has a (mis)-feature of possibly calling handlers even
+   // after the non-resumable XML_StopParser call.
+   //
+   if (status.parsing == XML_FINISHED)
+     return;
+
+   // This can be a followup event for empty elements (<foo/>). In this
+   // case the element name is already set.
+   //
+   if (status.parsing != XML_PARSING)
+   {
+     self.queue_ = end_element;
+     return;
+   }
+
+   // We may also have the end namespace declaration events which
+   // should come before the end element. If that's the case, then
+   // queue the end element and return the end namespace as the next
+   // event.
+   //
+   if (self.end_ns_i_ >= self.end_ns_.size ())
+     self.event_ = end_element;
+   else
+   {
+     self.event_ = end_namespace_decl;
+     self.queue_ = end_element;
+   }
+
+   split_name (name, self.qname_);
+
+   self.line_ = XML_GetCurrentLineNumber (self.p_);
+   self.column_ = XML_GetCurrentColumnNumber (self.p_);
+
+   XML_StopParser (self.p_, true);
+ }
+
+ // Expat character data handler. In empty and complex content
+ // whitespace is ignored and anything else aborts the parser. In other
+ // content the data becomes (or is appended to) the characters event
+ // value.
+ //
+ void XMLCALL parser::
+ characters_ (void* v, const XML_Char* s, int n)
+ {
+   parser& self (*static_cast<parser*> (v));
+
+   XML_ParsingStatus status;
+   XML_GetParsingStatus (self.p_, &status);
+
+   // Expat has a (mis)-feature of possibly calling handlers even
+   // after the non-resumable XML_StopParser call.
+   //
+   if (status.parsing == XML_FINISHED)
+     return;
+
+   // If this is empty or complex content, see if these are whitespaces.
+   //
+   switch (self.content ())
+   {
+   case empty:
+   case complex:
+     {
+       // Find the first non-whitespace character, if any.
+       //
+       int i (0);
+       for (; i != n; ++i)
+       {
+         const char c (s[i]);
+         if (c != 0x20 && c != 0x0A && c != 0x0D && c != 0x09)
+           break;
+       }
+
+       if (i != n)
+       {
+         // It would have been easier to throw the exception directly,
+         // however, the Expat code is most likely not exception safe.
+         //
+         self.line_ = XML_GetCurrentLineNumber (self.p_);
+         self.column_ = XML_GetCurrentColumnNumber (self.p_);
+         XML_StopParser (self.p_, false);
+       }
+
+       return;
+     }
+   default:
+     break;
+   }
+
+   if (status.parsing == XML_PARSING)
+   {
+     self.event_ = characters;
+     self.value_.assign (s, n);
+
+     self.line_ = XML_GetCurrentLineNumber (self.p_);
+     self.column_ = XML_GetCurrentColumnNumber (self.p_);
+
+     XML_StopParser (self.p_, true);
+   }
+   else
+   {
+     // This can be a followup event for another character event. In
+     // this case simply append the data.
+     //
+     assert (self.event_ == characters);
+     self.value_.append (s, n);
+   }
+ }
+
+ // Expat start namespace declaration handler. Appends the declaration
+ // to the pending list, substituting an empty string for a null prefix
+ // or namespace.
+ //
+ void XMLCALL parser::
+ start_namespace_decl_ (void* v, const XML_Char* prefix, const XML_Char* ns)
+ {
+   parser& self (*static_cast<parser*> (v));
+
+   XML_ParsingStatus status;
+   XML_GetParsingStatus (self.p_, &status);
+
+   // Expat has a (mis)-feature of possibly calling handlers even
+   // after the non-resumable XML_StopParser call.
+   //
+   if (status.parsing == XML_FINISHED)
+     return;
+
+   const XML_Char* const pfx (prefix != 0 ? prefix : "");
+   const XML_Char* const uri (ns != 0 ? ns : "");
+
+   self.start_ns_.push_back (qname_type ());
+   self.start_ns_.back ().prefix () = pfx;
+   self.start_ns_.back ().namespace_ () = uri;
+ }
+
+ // Expat end namespace declaration handler. Appends the declaration to
+ // the pending list, substituting an empty string for a null prefix.
+ //
+ void XMLCALL parser::
+ end_namespace_decl_ (void* v, const XML_Char* prefix)
+ {
+   parser& self (*static_cast<parser*> (v));
+
+   XML_ParsingStatus status;
+   XML_GetParsingStatus (self.p_, &status);
+
+   // Expat has a (mis)-feature of possibly calling handlers even
+   // after the non-resumable XML_StopParser call.
+   //
+   if (status.parsing == XML_FINISHED)
+     return;
+
+   const XML_Char* const pfx (prefix != 0 ? prefix : "");
+
+   self.end_ns_.push_back (qname_type ());
+   self.end_ns_.back ().prefix () = pfx;
+ }
+ }
+}