Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members

doc.cc

Go to the documentation of this file.
00001 /* ===========================================================================
00002  *        Filename:  doc.cc
00003  *     Description:  Doc with XML loading capabilities
00004  * 
00005  *         Version:  $Rev: 4 $
00006  *         Changed:  $Date: 2005-05-10 15:11:59 -0700 (Di, 10 Mai 2005) $
00007  *         Licence:  GPL (read COPYING file for details)
00008  * 
00009  *          Author:  Erich Schubert (eS), erich@debian.org
00010  *                   Institut für Informatik, LMU München
00011  * ======================================================================== */
00012 #include "config.h"
00013 #include "doc.h"
00014 #include <iostream>
00015 
00016 #include <libxml/encoding.h>
00017 #include <libxml/xmlreader.h>
00018 #include <libxml/xmlIO.h>
00019 #include <libxml/tree.h>
00020 
00021 #include <libxml/xpath.h>
00022 
00023 namespace SSD {
00024 
00025 bool Doc::useWhitespace = false;
00026 
00027 /* clean the loaded document */
00028 void
00029 Doc::flushDoc() {
00030         // clean the document
00031         if (root) {
00032                 delete root; root=NULL;
00033         }
00034         if (dom) {
00035                 xmlFreeDoc(dom); dom=NULL;
00036         }
00037         nodes.clear();
00038 }
00039 
00040 void
00041 Doc::walkTreeXPath(xmlXPathContextPtr xpathctx, xmlXPathCompExprPtr xpath, Node* node) {
00042 #ifdef CAREFUL
00043         if (!node) throw "Doc::walkTreeXPath called without a node";
00044 #endif
00045 
00046         xpathctx->node = (xmlNodePtr) node->data;
00047 
00048         xmlXPathObjectPtr xpathobj = xmlXPathCompiledEval( xpath, xpathctx );
00049         if (!xpathobj) throw "Doc::walkTreeXPath: xmlXPathCompiledEval failed";
00050         //if (!xpathobj->nodesetval) throw "Doc::walkTreeXPath: xmlXPathCompiledEval didn't return result";
00051         if (xpathobj->nodesetval) {
00052                 if (xpathobj->nodesetval->nodeNr > 0)
00053                         if (!node->reldown) node->reldown = new NodeVec;
00054                 for (int i = 0; i < xpathobj->nodesetval->nodeNr; i++) {
00055                         xmlNodePtr cur = xpathobj->nodesetval->nodeTab[i];
00056                         /* find the corresponding Node in our data structure */
00057                         if (xml_to_node.find(cur) != xml_to_node.end()) {
00058                                 Node* reln = xml_to_node[cur];
00059                                 /* register node as related */
00060                                 node->reldown->push_back(reln);
00061                                 if (!reln->relup)
00062                                         reln->relup = new NodeVec;
00063                                 reln->relup->push_back(node);
00064                                 /* do document relation count */
00065                                 RelEqClass key(node, reln);
00066                                 
00067                                 hash_map<RelEqClass, int, hash_releqc>::iterator pos = relcount.find(key);
00068                                 if (pos != relcount.end()) {
00069                                         relcount[key]++;
00070                                 } else {
00071                                         relcount.insert(make_pair(key,1));
00072                                 }
00073                         }
00074                 }
00075         }
00076         xmlXPathFreeObject(xpathobj);
00077 
00078         /* recurse into children */
00079         if (node->children)
00080                 for (NodeVec::iterator i = node->children->begin(); i != node->children->end(); i++)
00081                         walkTreeXPath(xpathctx, xpath, *i);
00082 }
00083 
00084 bool
00085 Doc::loadXML(const char* filename) {
00086         xmlNodePtr rootn = NULL;
00087         if (dom) { flushDoc(); }
00088 
00089         dom = xmlParseFile(filename);
00090 
00091         if (!dom)
00092                 throw "Couldn't load document";
00093         if (!dom->doc)
00094                 throw "Couldn't load document - no doc";
00095         if (! (rootn = xmlDocGetRootElement(dom->doc)) )
00096                 throw "Couldn't load document - no root";
00097 
00098         walkTree(NULL,rootn);
00099 
00100         return true;
00101 }
00102 
00103 void
00104 Doc::processXPath(char* xp) {
00105         /* generate xpath setup */
00106         xmlXPathContextPtr xpathctx = xmlXPathNewContext(dom);
00107         if (!xpathctx) throw "Doc::processXPath - xmlXPathNewContext failed.";
00108         xmlXPathCompExprPtr xpath = xmlXPathCtxtCompile(xpathctx, BAD_CAST xp);
00109         if (!xpath) throw "Doc::processXPath - xmlXPathCtxtCompile failed. Invalid xpath expression.";
00110 
00111         /* collect relations */
00112         walkTreeXPath(xpathctx, xpath, root);
00113 
00114         xmlXPathFreeCompExpr(xpath);
00115         xmlXPathFreeContext(xpathctx);
00116 }
00117 
#ifdef NEED_INDEX
/* Append `node` to the index_by_label entry selected by its NodeEqClass. */
void
Doc::add_to_index(Node* node) {
        NodeEqClass label(node);
        index_by_label[label].push_back(node);
}
#endif
00124 
#ifdef NEED_PROCESSED_SET
/* Record the address of the libxml2 node `node` in the `processed` set. */
void
Doc::add_to_processed(xmlNodePtr node) {
        void* address = static_cast<void*>(node);
        processed.insert(address);
}
#endif
00130 
00131 void
00132 Doc::walkTree(Node* pos, xmlNodePtr node) {
00133         Node* newnode=NULL;
00134         xmlAttrPtr attr=NULL;
00135         while (node) {
00136                 switch(node->type) {
00137                 case XML_ELEMENT_NODE:
00138                         newnode = appendNodeElement(pos,node);
00139 #ifdef NEED_PROCESSED_SET
00140                         add_to_processed(node);
00141 #endif
00142                         xml_to_node.insert(make_pair(node,newnode));
00143                         /* put into nodes vector */
00144                         nodes.push_back(newnode);
00145                         if (!root) { root = newnode; }
00146 #ifdef NEED_INDEX
00147                         add_to_index(newnode);
00148 #endif
00149 
00150                         // parse attributes
00151                         attr = node->properties;
00152                         while(attr) {
00153                                 Node* newattr = appendNodeAttribute(newnode,node,attr);
00154 #ifdef NEED_PROCESSED_SET
00155                                 add_to_processed((xmlNodePtr)attr);
00156 #endif
00157                                 xml_to_node.insert(make_pair((xmlNodePtr)attr,newattr));
00158                                 /* put into nodes vector */
00159                                 nodes.push_back(newattr);
00160 #ifdef NEED_INDEX
00161                                 add_to_index(newattr);
00162 #endif
00163                                 attr = attr->next;
00164                         }
00165 
00166                         walkTree(newnode,node->children);
00167                         break;
00168                 case XML_TEXT_NODE:
00169                         newnode = appendNodeText(pos,node);
00170                         if (newnode) {
00171 #ifdef NEED_PROCESSED_SET
00172                                 add_to_processed(node);
00173 #endif
00174                                 xml_to_node.insert(make_pair(node,newnode));
00175                                 /* put into nodes vector */
00176                                 nodes.push_back(newnode);
00177 #ifdef NEED_INDEX
00178                                 add_to_index(newnode);
00179 #endif
00180                         }
00181                         break;
00182                 case XML_COMMENT_NODE:
00183                         /* not really supported either, but we assume that we may
00184                          * just ignore comments */
00185                         break;
00186                 default:
00187                         std::cerr << "Unsupported node type: " << node->type << std::endl;
00188                 }
00189                 node = node->next;
00190         }
00191 }
00192 
00193 Node*
00194 Doc::appendNodeElement(Node* parent, xmlNodePtr node) {
00195         ustring name(node->name);
00196 #ifdef CAREFUL
00197         if (name.empty()) {
00198                 std::cerr << "Element node without text!"<< std::endl;
00199                 return NULL;
00200         }
00201 #endif
00202         /* create the new node */
00203         Node* newnode = new Node(name,empty_ustring,parent,node);
00204         if (parent) parent->addChild(newnode);
00205 //      std::cout << "Added labeled node '" << name << "'" << std::endl;
00206         return newnode;
00207 }
00208 
00209 Node*
00210 Doc::appendNodeText(Node* parent, xmlNodePtr node) {
00211         xmlChar* content = xmlNodeGetContent(node);
00212         ustring value(content);
00213         if (content) xmlFree(content);
00214         if (!useWhitespace && value.empty()) return NULL;
00215 
00216         /* create the new node */
00217         Node* newnode = new Node(empty_ustring,value,parent,node);
00218 #ifdef CAREFUL
00219         if (!parent) throw "No parent given for text node.";
00220 #endif
00221         parent->addChild(newnode);
00222 //      std::cout << "Added text node" << std::endl;
00223         return newnode;
00224 }
00225 
00226 Node*
00227 Doc::appendNodeAttribute(Node* parent, xmlNodePtr node, xmlAttrPtr attr) {
00228         ustring name(attr->name);
00229         ustring value(xmlNodeListGetString(node->doc,attr->children,1));
00230 #ifdef CAREFUL
00231         if (name.empty()) {
00232                 std::cerr << "Attribute node without name... " << std::endl;
00233                 return NULL;
00234         }
00235 #endif
00236 
00237         /* create the new node */
00238         Node* newnode = new Node(name,value,parent,attr);
00239 #ifdef CAREFUL
00240         if (!parent) throw "No parent given von Attribute node.";
00241 #endif
00242         parent->addChild(newnode);
00243 //      std::cout << "Added attribute node " << name << "='" << value << "'" << std::endl;
00244         return newnode;
00245 }
00246 
00247 /* destructor, flushing contents */
00248 Doc::~Doc() {
00249         flushDoc();
00250 }
00251 
00252 /* constructor */
00253 Doc::Doc() : root(NULL), dom(NULL)
00254 {
00255         // nothing to do.
00256 }
00257 
00258 } /* Namespace SSD */

Generated on Thu Aug 4 17:57:12 2005 for SSDDiff by  doxygen 1.4.3-20050530