Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members

diff.cc

Go to the documentation of this file.
00001 /* ===========================================================================
00002  *        Filename:  diff.c
00003  *     Description:  Find differences between two SSD trees.
00004  * 
00005  *         Version:  $Rev: 5 $
00006  *         Changed:  $Date: 2005-05-10 15:12:45 -0700 (Di, 10 Mai 2005) $
00007  *         Licence:  GPL (read COPYING file for details)
00008  * 
00009  *          Author:  Erich Schubert (eS), erich@debian.org
00010  *                   Institut für Informatik, LMU München
00011  * ========================================================================= */
00012 #include "diff.h"
00013 #include <vector>
00014 
00015 #ifdef VERBOSE
00016 # include <iostream>
00017 #endif
00018 
00019 namespace SSD {
00020 
00021 bool DiffDijkstra::fastApproximativeMode = false;
00022 
00023 /* constructor, adding the root nodes */
00024 DiffDijkstra::DiffDijkstra(Doc& eins, Doc& zwei)
00025 : Diff(eins,zwei),
00026 #ifdef VERBOSE_SEQCOUNT
00027                 seq(0),
00028 #endif
00029                 steps(0), result(NULL) {
00030 }
00031 
00032 DiffDijkstra::~DiffDijkstra() {
00033         if (result) delete result;
00034 }
00035 
00036 int
00037 process_relations(
00038                 Node* n1, Node* n2, RelCount* rc,
00039                 int dir, /* "up" or "down" relations */
00040                 const DiffDijkstraState* state,
00041                 map<NodeEqClass,int>** c /* return parameter: local credits */,
00042                 int* credits_used /* counter that will be updated */ ) {
00043 
00044         NodeVec* rn1;
00045         NodeVec* rn2;
00046         if (dir == 1) { rn1 = n1->reldown; rn2 = n2 ? n2->reldown : NULL; }
00047         if (dir == 2) { rn1 = n1->relup;   rn2 = n2 ? n2->relup   : NULL; }
00048 
00049         int cost=0;
00050         NodeVec::iterator i1, i2;
00051         /* local credits counter */
00052         map<NodeEqClass,int>* count = new map<NodeEqClass,int>;
00053         /* this set contains the nodes we must be related to in the second document */
00054         /* because we are related to them in the first */
00055         set<Node*> rel;
00056 
00057         /* first make a list of nodes we need to find in the second list */
00058         if (rn1)
00059         for (i1=rn1->begin(); i1 != rn1->end(); i1++) {
00060                 const NodeAssignments* f = state->findNodeAssignment1(*i1);
00061                 /* register node to be found */
00062                 if (f) {
00063                         /* dropping nodes always loses us the relation. */
00064                         if (f->n2) rel.insert(f->n2);
00065                         /* if f->n2 == NULL, the node was dropped, costs have been calculated before */
00066                         /* since we calculate these when dropping the first node */
00067                 } else {
00068                         /* do this also when dropping n1? */
00069                         /* still unmatched node, add to local credits */
00070                         NodeEqClass cp(*i1);
00071                         (*count)[cp] = 1 + (*count)[cp];
00072                 }
00073         }
00074         
00075         /* now check for matching nodes in the second documents relation */
00076         if (n2 && rn2)
00077         for (i2=rn2->begin(); i2 != rn2->end(); i2++) {
00078                 /* did we already map this node? */
00079                 const NodeAssignments* f = state->findNodeAssignment2(*i2);
00080                 if (f) {
00081                         /* to which node did we map it? */
00082                         set<Node*>::iterator f2 = rel.find(*i2);
00083                         if (f2 != rel.end()) {
00084                                 /* we have the matching node on the other side, too - optimal */
00085                                 /* no costs, and one node/relation less to care for */
00086                                 rel.erase(f2);
00087                         } else {
00088                                 /* we cannot retain this relation */
00089                                 if (dir == 1) {
00090                                         int lcost = rc->modify(RelEqClass(*n2,*i2), -1);
00091                                         cost += lcost; *credits_used += 1-cost;
00092                                 }
00093                                 if (dir == 2) {
00094                                         int lcost = rc->modify(RelEqClass(*i2,*n2), -1);
00095                                         cost += lcost; *credits_used += 1-cost;
00096                                 }
00097                         }
00098                 } else {
00099                         /* this node is still unmatched, add to local credits */
00100                         NodeEqClass cp(*i2);
00101                         (*count)[cp] = -1 + (*count)[cp];
00102                 }
00103         }
00104         if (!n2) {
00105                 cost *=2;
00106                 map<NodeEqClass,int>::iterator i3;
00107                 for (i3 = count->begin(); i3 != count->end(); i3++) {
00108                         i3->second *= 2;
00109                 }
00110         }
00111         /* now process remaining nodes in the first document */
00112         for(set<Node*>::iterator i = rel.begin(); i != rel.end(); i = rel.begin()) {
00113                 if (dir == 1) {
00114                         int lcost = rc->modify(RelEqClass(*n1,*i ), n2 ? +1 : +2);
00115                         cost += lcost; *credits_used += (n2 ? 1 : 2) - cost;
00116                 }
00117                 if (dir == 2) {
00118                         int lcost = rc->modify(RelEqClass(*i ,*n1), n2 ? +1 : +2);
00119                         cost += lcost; *credits_used += (n2 ? 1 : 2) - cost;
00120                 }
00121                 rel.erase(i);
00122         }
00123 
00124         /* we now have the expected (minimal) loss of relations in count */
00125         *c = count;
00126         return cost;
00127 }
00128 
00129 DiffDijkstraState*
00130 DiffDijkstra::makeState(const DiffDijkstraState* state, NodeVec::const_iterator n1, Node* n2) {
00131         int cost=0; int credits_used=0;
00132         RelCount* rc = new RelCount(*(state->credit));
00133 #ifdef VERBOSE_COSTS_2
00134         cout << "Calculating costs for matching " << (**n1) << " with ";
00135         if (n2) { cout << *n2; } else { cout << "NONE"; }
00136         cout << "." << endl;
00137 #endif
00138 
00139 //      NodeVec::iterator i1, i2;
00140         map<NodeEqClass,int>* count;
00141         cost += process_relations(*n1, n2, rc, 1, state, &count, &credits_used);
00142 #ifdef VERBOSE_COSTS_3
00143         cout << "Costs after process_relations down " << cost << endl;
00144 #endif
00145 
00146         map<NodeEqClass,int>::iterator i3;
00147         for (i3 = count->begin(); i3 != count->end(); i3++) {
00148                 int lcost = rc->modify(RelEqClass(**n1,i3->first), i3->second);
00149                 cost += lcost; credits_used = abs(i3->second) - lcost;
00150         }
00151 
00152 #ifdef VERBOSE_COSTS_4
00153         cout << "Costs with remaining nodes: " << cost << endl;
00154 #endif
00155         //count.clear();
00156         delete(count);
00157         /* up relations */
00158         cost += process_relations(*n1, n2, rc, 2, state, &count, &credits_used);
00159 #ifdef VERBOSE_COSTS_4
00160         cout << "Costs after process_relations up: " << cost << endl;
00161 #endif
00162 
00163         for (i3 = count->begin(); i3 != count->end(); i3++) {
00164                 int lcost = rc->modify(RelEqClass(i3->first,**n1), i3->second);
00165                 cost += lcost; credits_used = abs(i3->second) - lcost;
00166         }
00167         delete(count);
00168 
00169 #ifdef VERBOSE_COSTS
00170         /* calc costs */
00171         if (n2)
00172                 cout << "Cost for matching " << **n1 << " and " << *n2 << " is " << cost << endl;
00173         else 
00174                 cout << "Cost for dropping " << **n1 << " is " << cost << endl;
00175 #endif
00176 
00177         NodeAssignments* a = new NodeAssignments(*n1, n2, state->ass);
00178 
00179         NodeVec::const_iterator ni = n1; ni++;
00180         DiffDijkstraState* stat = new DiffDijkstraState(state->cost + cost,
00181                 state->credits_used + credits_used,
00182                 state->length + (n2?1:0), ni, a, rc);
00183 
00184 #ifdef VERBOSE_SEQCOUNT
00185         stat->seq = seq; seq++;
00186 #endif
00187 #ifdef TRACING_ENABLED
00188         if (searchTreeOutputStream) {
00189                 if (n2)
00190                         *searchTreeOutputStream << "Add " << stat->seq << "," << **n1 << "," << *n2 << "," << stat->cost << endl;
00191                 else
00192                         *searchTreeOutputStream << "Add " << stat->seq << "," << **n1 << ",," << stat->cost << endl;
00193         }
00194 #endif
00195         return stat;
00196 }
00197 
00198 bool
00199 DiffDijkstra::step() {
00200         if (worklist.empty()) return false;
00201 
00202         /* retrieve the current entry in the work list */
00203         DiffDijkstraState* current = *(worklist.begin());
00204         worklist.erase(worklist.begin());
00205 #ifdef TRACING_ENABLED
00206         if (searchTreeOutputStream) {
00207                 *searchTreeOutputStream << "Step " << ++steps << ": " << current->seq << "/" << seq << " (of " << worklist.size()+1 << ") cost "
00208                         << current->cost << ", len " << current->length << " ";
00209                 if (current->ass && current->ass->n1)
00210                         *searchTreeOutputStream << *(current->ass->n1);
00211                 if (current->ass && current->ass->n2)
00212                         *searchTreeOutputStream << ", " << *(current->ass->n2);
00213                 *searchTreeOutputStream << endl;
00214         }
00215 #endif
00216 
00217         if (current->iter == nodevec.end()) {
00218                 result = current;
00219                 return false;
00220         } else {
00221                 /* pos1 now points to the next node in the doc1 tree */
00222                 NodeEqClass cp(*(current->iter));
00223                 NodeVec* n = &(doc2->index_by_label[cp]);
00224 
00225                 /* add yet unmatched nodes to worklist as new steps */
00226                 for (NodeVec::iterator i = n->begin(); i != n->end(); i++)
00227                         if (*i && !current->findNodeAssignment2(*i))
00228                                 worklist.insert(makeState(current,current->iter, *i));
00229                 worklist.insert(makeState(current,current->iter, NULL));
00230                 /* delete current state */
00231                 delete current;
00232                 return true;
00233         }
00234 }
00235 
00236 bool
00237 DiffDijkstra::run() {
00238         /* calculate the credits by using the relation count */
00239         RelCount* credit = new RelCount(doc1->relcount, doc2->relcount);
00240         //cerr << *credit << endl;
00241 
00242         /* sort nodes by occurrence, low occurrence comes first */
00243         /* this small trick showed a 3* improvement in the first test - 84 steps instead of 224 */
00244         if (1) {
00245                 multimap< int, Node* > nsort;
00246 
00247                 for (NodeVec::const_iterator i=doc1->getNodesIter(); i != doc1->getNodesIterEnd(); ++i) {
00248                         int count = doc2->index_by_label[NodeEqClass(*i)].size();
00249                         nsort.insert(pair<int, Node*>(count, *i));
00250                 }
00251 
00252                 nodevec.empty();
00253                 for (multimap< int, Node*>::iterator i = nsort.begin(); i != nsort.end(); ++i) {
00254                         nodevec.push_back(i->second);
00255                 }
00256                 nsort.empty();
00257         } else {
00258                 /* just copy the first list. its private, so we can't copy it directly */
00259                 for (NodeVec::const_iterator i=doc1->getNodesIter(); i != doc1->getNodesIterEnd(); ++i)
00260                         nodevec.push_back(*i);
00261         }
00262 
00263         if (fastApproximativeMode) {
00264                 /* make start state with credits */
00265                 worklist.insert(new DiffDijkstraState(0,0,0,nodevec.begin(),NULL, credit));
00266                 while (step()) {
00267                         /* drop all other states except the first */
00268                         multiset<DiffDijkstraState*, DiffDijkstraStateQueue >::iterator wi = worklist.begin();
00269                         ++wi;
00270                         worklist.erase(wi, worklist.end());
00271                 }
00272                 /* drop any remaining element in the work queue */
00273                 while (worklist.begin() != worklist.end()) {
00274                         delete *(worklist.begin());
00275                         worklist.erase(worklist.begin());
00276                 }
00277         } else {
00278                 /* start with the root node, node matched nodes yet */
00279                 worklist.insert(new DiffDijkstraState(0,0,0,nodevec.begin(),NULL, credit));
00280                 /* process next element while not finished */
00281                 while (step()) {;};
00282                 /* drop any remaining element in the work queue */
00283                 while (worklist.begin() != worklist.end()) {
00284                         delete *(worklist.begin());
00285                         worklist.erase(worklist.begin());
00286                 }
00287         }
00288         /* return "done" */
00289         return false;
00290 }
00291 
00292 #ifdef TRACING_ENABLED
00293 /* setup and open the output stream for search tree dumping */
00294 void DiffDijkstra::setSearchTreeOutput(char* filename) {
00295         if (searchTreeOutputStream) delete(searchTreeOutputStream);
00296         searchTreeOutputStream = new std::ofstream(filename);
00297         if (!searchTreeOutputStream->is_open()) {
00298                 delete(searchTreeOutputStream);
00299                 searchTreeOutputStream = NULL;
00300         }
00301 }
00302 
00303 /* initialize stream to no output */
00304 std::ofstream* DiffDijkstra::searchTreeOutputStream = NULL;
00305 #endif
00306 
00307 NodeAssignments::NodeAssignments(Node* nn1, Node* nn2, NodeAssignments* nnext)
00308         : n1(nn1), n2(nn2), next(nnext), refcount(1) {
00309         if (next) { next->refcount++; }
00310 };
00311 /* release and destroy if needed */
00312 bool NodeAssignments::release() {
00313         refcount--;
00314         return (refcount == 0);
00315 };
00316 NodeAssignments::~NodeAssignments() {
00317         /* destroy linked elements no longer referred to */
00318         if (next) {
00319                 if (next->release()) {
00320                         delete(next);
00321                         next = NULL;
00322                 }
00323         }
00324 };
00325 
00326 const NodeAssignments* DiffDijkstraState::findNodeAssignment1(Node* n1) const {
00327         NodeAssignments* iter = ass;
00328         while(iter) {
00329                 if (iter->n1 == n1) return iter;
00330                 iter = iter->next;
00331         }
00332         return NULL;
00333 }
00334 
00335 const NodeAssignments* DiffDijkstraState::findNodeAssignment2(Node* n2) const {
00336         NodeAssignments* iter = ass;
00337         while(iter) {
00338                 if (iter->n2 == n2) return iter;
00339                 iter = iter->next;
00340         }
00341         return NULL;
00342 }
00343 
00344 }

Generated on Thu Aug 4 17:57:12 2005 for SSDDiff by  doxygen 1.4.3-20050530