1 /**
    2  *  This file contains code that takes two strings and performs a difference on them,
    3  *  then creates delta operations and appends them to the delta parent node.
    4  *
    5  *  The elements used to describe delta operations are:
    6  *
    7  *    - <tr /> for text replace
    8  *    - <ti /> for text insertion
    9  *    - <td /> for text deletion
   10  */
   11 
   12 /*
   13  * XyStrDiff.hpp
   14  */
   15 
   16 #ifndef XyStrDiff_HXX__
   17 #define XyStrDiff_HXX__
   18 
   19 #include <stdio.h>
   20 #include <iostream>
   21 #include <string>
   22 #include <vector>
   23 
   24 #include "xercesc/dom/DOMDocument.hpp"
   25 #include "xercesc/dom/DOMElement.hpp"
   26 #include "xercesc/dom/DOMNodeFilter.hpp"
   27 
   28 class XyStrDiff {
   29 
   30     /**
   31      * Used to indicate what operation is performed on the current character
   32      */
   33     static const int NOOP = 0;
   34     static const int SUB  = 1;
   35     static const int INS  = 2;
   36     static const int DEL  = 3;
   37 
   38 
   39     public:
   40         /**
   41          * Constructor
   42          */
   43         XyStrDiff(xercesc::DOMDocument *myDoc, xercesc::DOMElement *elem, const XMLCh *x, const XMLCh *y);
   44 
   45         /**
   46          * Destructor
   47          */
   48         ~XyStrDiff();
   49 
   50         /**
   51          * Triggers the delta construction
   52          */
   53         void addDeltaOperations();
   54 
   55     private :
   56         /**
   57          * @see http://en.wikipedia.org/wiki/Levenshtein_distance
   58          */
   59         void LevenshteinDistance();
   60 
   61         /**
   62          * Backtraces through the LCS matrix created in LevenshteinDistance() to construct
   63          * the delta operations
   64          */
   65         void calculatePath(int i=-1, int j=-1);
   66 
   67         /**
   68          * Registers a character with either the insert or delete buffers
   69          */
   70         void registerBuffer(int i, int optype, XMLCh chr);
   71 
   72         /**
   73          * Flush the buffer of insert and delete text and create a delta operation element
   74          */
   75         void flushBuffers();
   76 
   77         /**
   78          * Used in the event that either string is longer than 65535. Simply creates one <tr /> operation
   79          */
   80         void simpleReplace();
   81 
   82         /**
   83          * The root element to which the delta operations will be appended
   84          */
   85         xercesc::DOMElement *root;
   86 
   87         /**
   88          * The ownerDocument of this.root
   89          */
   90         xercesc::DOMDocument *doc;
   91 
   92         /**
   93          * The current xpos and ypos used in the LCS walk of calculatePath()
   94          */ 
   95         int xpos, ypos;
   96 
   97         /**
   98          * the strX contructor argument run through XMLString::replication
   99          */
  100         XMLCh *x;
  101 
  102         /**
  103          * the strY contructor argument run through XMLString::replication
  104          */
  105         XMLCh *y;
  106 
  107         /**
  108          * An array used to store the path calculations
  109          * of the LevenshteinDistance distance algorithm
  110          */
  111         unsigned short *c;
  112 
  113         /**
  114          * One of the class constants above
  115          */
  116         int currop;
  117 
  118         /**
  119          * A buffer containing the text to be inserted or
  120          * used to replace in the next operation
  121          */
  122         std::basic_string<XMLCh> insbuf;
  123 
  124         /**
  125          * A buffer containing the text that will be
  126          * deleted or replaced in the next operation
  127          */
  128         std::basic_string<XMLCh> delbuf;
  129         int sizex, sizey;
  130 };
  131 
  132 #endif
  133 
  134 /*
  135  *  XyStrDiff.cpp
  136  */
  137 
  138 #include "include/XyStrDiff.hpp"
  139 
  140 
  141 
  142 /*
  143  * XyStrDiff functions (character-by-character string diffs)
  144  */
  145 
  146 XERCES_CPP_NAMESPACE_USE 
  147 
  148 static const XMLCh gLS[] = { chLatin_L, chLatin_S, chNull };
  149 
  150 XyStrDiff::XyStrDiff(DOMDocument *myDoc, DOMElement *elem, const XMLCh *strX, const XMLCh *strY)
  151 {
  152     doc = myDoc;
  153     root = elem;
  154 
  155     sizex = XMLString::stringLen(strX);
  156     sizey = XMLString::stringLen(strY);
  157 
  158     x = XMLString::replicate(strX);
  159     y = XMLString::replicate(strY);
  160 
  161     n = sizex;
  162     m = sizey;
  163     c = NULL;
  164 
  165     currop = -1;
  166 }
  167 
  168 /*
  169  * Destructor
  170  */
  171 
  172 XyStrDiff::~XyStrDiff(void)
  173 {
  174     if (c != NULL) {
  175         free(c);
  176     }
  177     if (x != NULL) {
  178         XMLString::release(&x);
  179     }
  180     if (y != NULL) {
  181         XMLString::release(&y);
  182     }
  183 }
  184 
  185 void XyStrDiff::addDeltaOperations()
  186 {
  187     this->LevenshteinDistance();
  188     this->calculatePath();
  189     this->flushBuffers();
  190 }
  191 
  192 void XyStrDiff::LevenshteinDistance()
  193 {
  194     if (n > 65535 || m > 65535) {
  195         // do a simple replace because of memory constraints
  196         this->simpleReplace();
  197         return;
  198     }
  199 
  200     if (c == NULL) {
  201         int cmalloclen = (sizeof(unsigned short))*(sizex+1)*(sizey+1);
  202         c = (unsigned short*) malloc(cmalloclen);
  203     }
  204 
  205     // Step 1
  206     unsigned short k, i, j;
  207     
  208     n = XMLString::stringLen(x);
  209     m = XMLString::stringLen(y);
  210 
  211     if (n != 0 && m != 0) {
  212         m++;
  213         n++;
  214         // Step 1: Set up an empty matrix
  215         for(k = 0; k < n; k++) {
  216             c[k] = 0;
  217         }
  218         for(k = 0; k < m; k++) {
  219             c[k*n] = 0;
  220         }
  221         int del, ins, sub, a, b;
  222         // Step 3 and 4: Replace 
  223         for(i = 1; i < n; i++) {
  224             for(j = 1; j < m; j++) {
  225                 // Same character, nothing to see here
  226                 if (x[i-1] == y[j-1]) {
  227                     c[j*n+i] = c[(j-1)*n + i-1] + 1;
  228                 } else {
  229                     a = c[(j-1)*n + i];         // deletion
  230                     b = c[j*n + i-1];           // substitution
  231                     c[j*n+i] = (a > b) ? a : b; // Base on which is less costly
  232                 }
  233             }
  234         }
  235     }
  236 }
  237 
  238 void XyStrDiff::calculatePath(int i, int j)
  239 {
  240     int malloclen = (sizeof(int))*4*(sizex + 1 + sizey+1);
  241 
  242     // ops is a 2-dimensional integer array with 4 values per row
  243     // 0 - xpos, the first parameter passed to registerBuffer
  244     // 1 - the operation (integer constant, XyStrDiff::NOOP, XyStrDiff::INS, or XyStrDiff::DEL)
  245     // 2 - whether to use x or y as the string from which we are passing the character
  246     //    1 = x, 2 = y
  247     // 3 - the character position within the string determined by (2) that is passed to registerBuffer()
  248 
  249     int *ops = (int *) malloc(malloclen);
  250 
  251     if (i == -1) i = sizex;
  252     if (j == -1) j = sizey;
  253 
  254     int opnum = 0;
  255     while (i >= 0 && j >= 0) {
  256         if (i > 0 && j > 0 && (x[i-1] == y[j-1])) {
  257             ops[opnum*4 + 0] = i-1;
  258             ops[opnum*4 + 1] = XyStrDiff::NOOP;
  259             ops[opnum*4 + 2] = 1;
  260             ops[opnum*4 + 3] = i-1;
  261             i = i-1;
  262             j = j-1;
  263             opnum++;
  264             continue;               
  265         } else {
  266             if (j > 0 && (i == 0 || c[(j-1)*n+i] >= c[j*n+i-1])) {
  267                 ops[opnum*4 + 0] = i;
  268                 ops[opnum*4 + 1] = XyStrDiff::INS;
  269                 ops[opnum*4 + 2] = 2;
  270                 ops[opnum*4 + 3] = j-1;
  271                 j = j-1;
  272                 opnum++;
  273                 continue;
  274             } else if (i > 0 && (j == 0 || c[(j-1)*n+i] < c[j*n+i-1])) {
  275                 if (i == sizex && j == sizey) {
  276                     ops[opnum*4 + 0] = i;
  277                 } else {
  278                     ops[opnum*4 + 0] = i-1;
  279                 }
  280                 ops[opnum*4 + 1] = XyStrDiff::DEL;
  281                 ops[opnum*4 + 2] = 1;
  282                 ops[opnum*4 + 3] = i-1;
  283                 i = i - 1;
  284                 opnum++;
  285                 continue;
  286             }
  287         }
  288         break;
  289     }
  290 
  291     int currXpos;
  292     int optype;
  293     int charpos;
  294     int z;
  295     
  296     for (z = opnum-1; z >= 0; z--) {
  297         currXpos = ops[z*4 + 0];
  298         optype   = ops[z*4 + 1];
  299         charpos  = ops[z*4 + 3];
  300         if (ops[z*4 + 2] == 1) {
  301             this->registerBuffer(currXpos, optype, x[charpos]);
  302         } else {
  303             this->registerBuffer(currXpos, optype, y[charpos]);
  304         }
  305     }
  306 }
  307 
  308 void XyStrDiff::registerBuffer(int currXpos, int optype, XMLCh chr)
  309 {
  310 
  311     xpos = currXpos;
  312 
  313     if (currop == -1) {
  314         currop = optype
  315     }
  316 
  317     switch (currop) {
  318 
  319         case XyStrDiff::SUB:
  320             if (optype == XyStrDiff::DEL) {
  321                 delbuf += chr;
  322             } else if (optype == XyStrDiff::INS) {
  323                 insbuf += chr;
  324             } else {
  325                 this->flushBuffers();
  326                 currop = optype;
  327             }
  328             break;
  329 
  330         case XyStrDiff::DEL:
  331             currop = optype;
  332             delbuf += chr;
  333             break;
  334 
  335         case XyStrDiff::INS:
  336             currop = (currop == XyStrDiff::DEL) ? XyStrDiff::SUB : XyStrDiff::INS;
  337             insbuf += chr;
  338             break;
  339 
  340         case XyStrDiff::NOOP:
  341             this->flushBuffers();
  342             currop = optype;
  343             break;
  344 
  345     }
  346 }
  347 
  348 void XyStrDiff::simpleReplace()
  349 {
  350     int startpos, len;
  351     XMLCh tempStrA[100];
  352     XMLCh tempStrB[100];
  353 
  354     startpos = 0;
  355     len = sizex;
  356     try {
  357         XMLString::transcode("tr", tempStrA, 99);
  358         DOMElement *r = doc->createElement(tempStrA);
  359         XMLString::transcode("pos", tempStrA, 99);
  360         XMLString::transcode(itoa(startpos).c_str(), tempStrB, 99);
  361         r->setAttribute(tempStrA, tempStrB);
  362         XMLString::transcode("len", tempStrA, 99);
  363         XMLString::transcode(itoa(len).c_str(), tempStrB, 99);
  364         r->setAttribute(tempStrA, tempStrB);
  365 
  366         DOMText *textNode = doc->createTextNode(y);
  367 
  368         r->appendChild((DOMNode *)textNode);
  369         root->appendChild((DOMNode *)r);
  370     }
  371     catch (const XMLException& toCatch) {
  372         std::cout << "XMLException: " << XMLString::transcode(toCatch.getMessage()) << std::endl;
  373     }
  374     catch (const DOMException& toCatch) {
  375         std::cout << "DOMException: " << XMLString::transcode(toCatch.getMessage()) << std::endl;
  376     }
  377     catch (...) {
  378         std::cout << "Unexpected Exception: " << std::endl;
  379     }
  380 }
  381 
  382 
  383 void XyStrDiff::flushBuffers()
  384 {
  385     int startpos, len;
  386     XMLCh tempStrA[100];
  387     XMLCh tempStrB[100];
  388     
  389     if (currop == XyStrDiff::NOOP) {
  390         return;
  391     }
  392 
  393     if (currop == XyStrDiff::SUB) {
  394 
  395         len = delbuf.length();
  396         startpos = xpos - len;
  397         
  398         try {
  399             XMLString::transcode("tr", tempStrA, 99);
  400             DOMElement *r = doc->createElement(tempStrA);
  401             XMLString::transcode("pos", tempStrA, 99);
  402             XMLString::transcode(itoa(startpos).c_str(), tempStrB, 99);
  403             r->setAttribute(tempStrA, tempStrB);
  404             XMLString::transcode("len", tempStrA, 99);
  405             XMLString::transcode(itoa(len).c_str(), tempStrB, 99);
  406             r->setAttribute(tempStrA, tempStrB);
  407 
  408             DOMText *textNode = doc->createTextNode(insbuf.c_str());
  409 
  410             r->appendChild((DOMNode *)textNode);
  411             root->appendChild((DOMNode *)r);
  412         }
  413         catch (const XMLException& toCatch) {
  414             std::cout << "XMLException: " << XMLString::transcode(toCatch.getMessage()) << std::endl;
  415         }
  416         catch (const DOMException& toCatch) {
  417             std::cout << "DOMException: " << XMLString::transcode(toCatch.getMessage()) << std::endl;
  418         }
  419         catch (...) {
  420             std::cout << "Unexpected Exception: " << std::endl;
  421         }
  422 
  423         delbuf.clear();
  424         insbuf.clear();
  425 
  426     } else if (currop == XyStrDiff::INS) {
  427 
  428         startpos = xpos;
  429         
  430         try {
  431             XMLString::transcode("ti", tempStrA, 99);
  432             DOMElement *r = doc->createElement(tempStrA);
  433 
  434             XMLString::transcode("pos", tempStrA, 99);
  435             XMLString::transcode(itoa(startpos).c_str(), tempStrB, 99);
  436             r->setAttribute(tempStrA, tempStrB);
  437 
  438             DOMText *textNode = doc->createTextNode(insbuf.c_str());
  439 
  440             r->appendChild((DOMNode *)textNode);
  441             root->appendChild((DOMNode *)r);
  442         }
  443         catch (const XMLException& toCatch) {
  444             std::cout << "Exception message is: \n" << XMLString::transcode(toCatch.getMessage()) << std::endl;
  445         }
  446         catch (const DOMException& toCatch) {
  447             std::cout << "Exception message is: \n" << XMLString::transcode(toCatch.getMessage()) << std::endl;
  448         }
  449         catch (...) {
  450             std::cout << "Unexpected Exception" << std::endl; 
  451         }
  452 
  453         insbuf.clear();
  454 
  455     } else if (currop == XyStrDiff::DEL) {
  456 
  457         len = delbuf.length();
  458         startpos = xpos - len;
  459 
  460         try {
  461             XMLString::transcode("td", tempStrA, 99);
  462             DOMElement *r = doc->createElement(tempStrA);
  463             XMLString::transcode("pos", tempStrA, 99);
  464             XMLString::transcode(itoa(startpos).c_str(), tempStrB, 99);
  465             r->setAttribute(tempStrA, tempStrB);
  466             XMLString::transcode("len", tempStrA, 99);
  467             XMLString::transcode(itoa(len).c_str(), tempStrB, 99);
  468             r->setAttribute(tempStrA, tempStrB);        
  469             root->appendChild((DOMNode *)r);
  470         }
  471         catch (const XMLException& toCatch) {
  472             std::cout << "Exception message is: \n" << XMLString::transcode(toCatch.getMessage()) << std::endl;
  473         }
  474         catch (const DOMException& toCatch) {
  475             std::cout << "Exception message is: \n" << XMLString::transcode(toCatch.getMessage()) << std::endl;
  476         }
  477         catch (...) {
  478             std::cout << "Unexpected Exception" << std::endl;
  479         }
  480     delbuf.clear();
  481     }
  482 }