View Javadoc

1   /*
2    *  Java HTML Tidy - JTidy
3    *  HTML parser and pretty printer
4    *
5    *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6    *  Institute of Technology, Institut National de Recherche en
7    *  Informatique et en Automatique, Keio University). All Rights
8    *  Reserved.
9    *
10   *  Contributing Author(s):
11   *
12   *     Dave Raggett <dsr@w3.org>
13   *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14   *     Gary L Peskin <garyp@firstech.com> (Java development)
15   *     Sami Lempinen <sami@lempinen.net> (release management)
16   *     Fabrizio Giustina <fgiust at users.sourceforge.net>
17   *
18   *  The contributing author(s) would like to thank all those who
19   *  helped with testing, bug fixes, and patience.  This wouldn't
20   *  have been possible without all of you.
21   *
22   *  COPYRIGHT NOTICE:
23   *
24   *  This software and documentation is provided "as is," and
25   *  the copyright holders and contributing author(s) make no
26   *  representations or warranties, express or implied, including
27   *  but not limited to, warranties of merchantability or fitness
28   *  for any particular purpose or that the use of the software or
29   *  documentation will not infringe any third party patents,
30   *  copyrights, trademarks or other rights.
31   *
32   *  The copyright holders and contributing author(s) will not be
33   *  liable for any direct, indirect, special or consequential damages
34   *  arising out of any use of the software or documentation, even if
35   *  advised of the possibility of such damage.
36   *
37   *  Permission is hereby granted to use, copy, modify, and distribute
38   *  this source code, or portions hereof, documentation and executables,
39   *  for any purpose, without fee, subject to the following restrictions:
40   *
41   *  1. The origin of this source code must not be misrepresented.
42   *  2. Altered versions must be plainly marked as such and must
43   *     not be misrepresented as being the original source.
44   *  3. This Copyright notice may not be removed or altered from any
45   *     source or altered source distribution.
46   *
47   *  The copyright holders and contributing author(s) specifically
48   *  permit, without fee, and encourage the use of this source code
49   *  as a component for supporting the Hypertext Markup Language in
50   *  commercial products. If you use this source code in a product,
51   *  acknowledgment is not required but would be appreciated.
52   *
53   */
54  package org.w3c.tidy;
55  
56  /**
57   * Clean up misuse of presentation markup. Filters from other formats such as Microsoft Word often make excessive use of
58   * presentation markup such as font tags, B, I, and the align attribute. By applying a set of production rules, it is
59   * straightforward to transform this to use CSS. Some rules replace some of the children of an element by style
60   * properties on the element, e.g.
61   * <p>
62   * <b>... </b>
63   * </p>.
64   * <p style="font-weight: bold">
65   * ...
66   * </p>
67   * Such rules are applied to the element's content and then to the element itself until no more rules apply.
68   * Having applied all the rules to an element, it will have a style attribute with one or more properties. Other rules
69   * strip the element they apply to, replacing it by style properties on the contents, e.g. <dir>
70   * <li>
71   * <p>
72   * ...</li>
73   * </dir>.
74   * <p style="margin-left: 1em">
75   * ... These rules are applied to an element before processing its content and replace the current element by the first
76   * element in the exposed content. After applying both sets of rules, you can replace the style attribute by a class
77   * value and style rule in the document head. To support this, an association of styles and class names is built. A
78   * naive approach is to rely on string matching to test when two property lists are the same. A better approach would be
79   * to first sort the properties before matching.
80   * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
81   * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
82   * @author Fabrizio Giustina
83   * @version $Revision: 802 $ ($Author: fgiust $)
84   */
85  public class Clean
86  {
87  
88      /**
89       * sequential number for generated css classes.
90       */
91      private int classNum = 1;
92  
93      /**
94       * Tag table.
95       */
96      private TagTable tt;
97  
98      /**
99       * Instantiates a new Clean.
100      * @param tagTable tag table instance
101      */
102     public Clean(TagTable tagTable)
103     {
104         this.tt = tagTable;
105     }
106 
107     /**
108      * Insert a css style property.
109      * @param props StyleProp instance
110      * @param name property name
111      * @param value property value
112      * @return StyleProp containin the given property
113      */
114     private StyleProp insertProperty(StyleProp props, String name, String value)
115     {
116         StyleProp first, prev, prop;
117         int cmp;
118 
119         prev = null;
120         first = props;
121 
122         while (props != null)
123         {
124             cmp = props.name.compareTo(name);
125 
126             if (cmp == 0)
127             {
128                 // this property is already defined, ignore new value
129                 return first;
130             }
131 
132             if (cmp > 0) // props.name > name
133             {
134                 // insert before this
135 
136                 prop = new StyleProp(name, value, props);
137 
138                 if (prev != null)
139                 {
140                     prev.next = prop;
141                 }
142                 else
143                 {
144                     first = prop;
145                 }
146 
147                 return first;
148             }
149 
150             prev = props;
151             props = props.next;
152         }
153 
154         prop = new StyleProp(name, value, null);
155 
156         if (prev != null)
157         {
158             prev.next = prop;
159         }
160         else
161         {
162             first = prop;
163         }
164 
165         return first;
166     }
167 
168     /**
169      * Create sorted linked list of properties from style string.
170      * @param prop StyleProp
171      * @param style style string
172      * @return StyleProp with given style
173      */
174     private StyleProp createProps(StyleProp prop, String style)
175     {
176         int nameEnd;
177         int valueEnd;
178         int valueStart = 0;
179         int nameStart = 0;
180         boolean more;
181 
182         nameStart = 0;
183         while (nameStart < style.length())
184         {
185             while (nameStart < style.length() && style.charAt(nameStart) == ' ')
186             {
187                 ++nameStart;
188             }
189 
190             nameEnd = nameStart;
191 
192             while (nameEnd < style.length())
193             {
194                 if (style.charAt(nameEnd) == ':')
195                 {
196                     valueStart = nameEnd + 1;
197                     break;
198                 }
199 
200                 ++nameEnd;
201             }
202 
203             if (nameEnd >= style.length() || style.charAt(nameEnd) != ':')
204             {
205                 break;
206             }
207 
208             while (valueStart < style.length() && style.charAt(valueStart) == ' ')
209             {
210                 ++valueStart;
211             }
212 
213             valueEnd = valueStart;
214             more = false;
215 
216             while (valueEnd < style.length())
217             {
218                 if (style.charAt(valueEnd) == ';')
219                 {
220                     more = true;
221                     break;
222                 }
223 
224                 ++valueEnd;
225             }
226 
227             prop = insertProperty(prop, style.substring(nameStart, nameEnd), style.substring(valueStart, valueEnd));
228 
229             if (more)
230             {
231                 nameStart = valueEnd + 1;
232                 continue;
233             }
234 
235             break;
236         }
237 
238         return prop;
239     }
240 
241     /**
242      * Create a css property.
243      * @param props StyleProp
244      * @return css property as String
245      */
246     private String createPropString(StyleProp props)
247     {
248         String style = "";
249         int len;
250         StyleProp prop;
251 
252         // compute length
253         for (len = 0, prop = props; prop != null; prop = prop.next)
254         {
255             len += prop.name.length() + 2;
256             len += prop.value.length() + 2;
257         }
258 
259         for (prop = props; prop != null; prop = prop.next)
260         {
261             style = style.concat(prop.name);
262             style = style.concat(": ");
263 
264             style = style.concat(prop.value);
265 
266             if (prop.next == null)
267             {
268                 break;
269             }
270 
271             style = style.concat("; ");
272         }
273 
274         return style;
275     }
276 
277     /**
278      * Creates a string with merged properties.
279      * @param style css style
280      * @param property css properties
281      * @return merged string
282      */
283     private String addProperty(String style, String property)
284     {
285         StyleProp prop;
286 
287         prop = createProps(null, style);
288         prop = createProps(prop, property);
289         style = createPropString(prop);
290         return style;
291     }
292 
293     /**
294      * Generates a new css class name.
295      * @param lexer Lexer
296      * @param tag Tag
297      * @return generated css class
298      */
299     private String gensymClass(Lexer lexer, String tag)
300     {
301         String str;
302 
303         str = lexer.configuration.cssPrefix == null ? lexer.configuration.cssPrefix + this.classNum : "c"
304             + this.classNum;
305         this.classNum++;
306         return str;
307     }
308 
309     /**
310      * Finds a css style.
311      * @param lexer Lexer
312      * @param tag tag name
313      * @param properties css properties
314      * @return style string
315      */
316     private String findStyle(Lexer lexer, String tag, String properties)
317     {
318         Style style;
319 
320         for (style = lexer.styles; style != null; style = style.next)
321         {
322             if (style.tag.equals(tag) && style.properties.equals(properties))
323             {
324                 return style.tagClass;
325             }
326         }
327 
328         style = new Style(tag, gensymClass(lexer, tag), properties, lexer.styles);
329         lexer.styles = style;
330         return style.tagClass;
331     }
332 
333     /**
334      * Find style attribute in node, and replace it by corresponding class attribute. Search for class in style
335      * dictionary otherwise gensym new class and add to dictionary. Assumes that node doesn't have a class attribute.
336      * @param lexer Lexer
337      * @param node node with a style attribute
338      */
339     private void style2Rule(Lexer lexer, Node node)
340     {
341         AttVal styleattr, classattr;
342         String classname;
343 
344         styleattr = node.getAttrByName("style");
345 
346         if (styleattr != null)
347         {
348             classname = findStyle(lexer, node.element, styleattr.value);
349             classattr = node.getAttrByName("class");
350 
351             // if there already is a class attribute then append class name after a space
352 
353             if (classattr != null)
354             {
355                 classattr.value = classattr.value + " " + classname;
356                 node.removeAttribute(styleattr);
357             }
358             else
359             {
360                 // reuse style attribute for class attribute
361                 styleattr.attribute = "class";
362                 styleattr.value = classname;
363             }
364         }
365     }
366 
367     /**
368      * Adds a css rule for color.
369      * @param lexer Lexer
370      * @param selector css selector
371      * @param color color value
372      */
373     private void addColorRule(Lexer lexer, String selector, String color)
374     {
375         if (color != null)
376         {
377             lexer.addStringLiteral(selector);
378             lexer.addStringLiteral(" { color: ");
379             lexer.addStringLiteral(color);
380             lexer.addStringLiteral(" }\n");
381         }
382     }
383 
384     /**
385      * Move presentation attribs from body to style element.
386      * 
387      * <pre>
388      * background="foo" . body { background-image: url(foo) }
389      * bgcolor="foo" . body { background-color: foo }
390      * text="foo" . body { color: foo }
391      * link="foo" . :link { color: foo }
392      * vlink="foo" . :visited { color: foo }
393      * alink="foo" . :active { color: foo }
394      * </pre>
395      * 
396      * @param lexer Lexer
397      * @param body body node
398      */
399     private void cleanBodyAttrs(Lexer lexer, Node body)
400     {
401         AttVal attr;
402         String bgurl = null;
403         String bgcolor = null;
404         String color = null;
405 
406         attr = body.getAttrByName("background");
407 
408         if (attr != null)
409         {
410             bgurl = attr.value;
411             attr.value = null;
412             body.removeAttribute(attr);
413         }
414 
415         attr = body.getAttrByName("bgcolor");
416 
417         if (attr != null)
418         {
419             bgcolor = attr.value;
420             attr.value = null;
421             body.removeAttribute(attr);
422         }
423 
424         attr = body.getAttrByName("text");
425 
426         if (attr != null)
427         {
428             color = attr.value;
429             attr.value = null;
430             body.removeAttribute(attr);
431         }
432 
433         if (bgurl != null || bgcolor != null || color != null)
434         {
435             lexer.addStringLiteral(" body {\n");
436 
437             if (bgurl != null)
438             {
439                 lexer.addStringLiteral("  background-image: url(");
440                 lexer.addStringLiteral(bgurl);
441                 lexer.addStringLiteral(");\n");
442             }
443 
444             if (bgcolor != null)
445             {
446                 lexer.addStringLiteral("  background-color: ");
447                 lexer.addStringLiteral(bgcolor);
448                 lexer.addStringLiteral(";\n");
449             }
450 
451             if (color != null)
452             {
453                 lexer.addStringLiteral("  color: ");
454                 lexer.addStringLiteral(color);
455                 lexer.addStringLiteral(";\n");
456             }
457 
458             lexer.addStringLiteral(" }\n");
459         }
460 
461         attr = body.getAttrByName("link");
462 
463         if (attr != null)
464         {
465             addColorRule(lexer, " :link", attr.value);
466             body.removeAttribute(attr);
467         }
468 
469         attr = body.getAttrByName("vlink");
470 
471         if (attr != null)
472         {
473             addColorRule(lexer, " :visited", attr.value);
474             body.removeAttribute(attr);
475         }
476 
477         attr = body.getAttrByName("alink");
478 
479         if (attr != null)
480         {
481             addColorRule(lexer, " :active", attr.value);
482             body.removeAttribute(attr);
483         }
484     }
485 
486     /**
487      * Check deprecated attributes in body tag.
488      * @param lexer Lexer
489      * @param doc document root node
490      * @return <code>true</code> is the body doesn't contain deprecated attributes, false otherwise.
491      */
492     private boolean niceBody(Lexer lexer, Node doc)
493     {
494         Node body = doc.findBody(lexer.configuration.tt);
495 
496         if (body != null)
497         {
498             if (body.getAttrByName("background") != null
499                 || body.getAttrByName("bgcolor") != null
500                 || body.getAttrByName("text") != null
501                 || body.getAttrByName("link") != null
502                 || body.getAttrByName("vlink") != null
503                 || body.getAttrByName("alink") != null)
504             {
505                 lexer.badLayout |= Report.USING_BODY;
506                 return false;
507             }
508         }
509 
510         return true;
511     }
512 
513     /**
514      * Create style element using rules from dictionary.
515      * @param lexer Lexer
516      * @param doc root node
517      */
518     private void createStyleElement(Lexer lexer, Node doc)
519     {
520         Node node, head, body;
521         Style style;
522         AttVal av;
523 
524         if (lexer.styles == null && niceBody(lexer, doc))
525         {
526             return;
527         }
528 
529         node = lexer.newNode(Node.START_TAG, null, 0, 0, "style");
530         node.implicit = true;
531 
532         // insert type attribute
533         av = new AttVal(null, null, '"', "type", "text/css");
534         av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
535         node.attributes = av;
536 
537         body = doc.findBody(lexer.configuration.tt);
538 
539         lexer.txtstart = lexer.lexsize;
540 
541         if (body != null)
542         {
543             cleanBodyAttrs(lexer, body);
544         }
545 
546         for (style = lexer.styles; style != null; style = style.next)
547         {
548             lexer.addCharToLexer(' ');
549             lexer.addStringLiteral(style.tag);
550             lexer.addCharToLexer('.');
551             lexer.addStringLiteral(style.tagClass);
552             lexer.addCharToLexer(' ');
553             lexer.addCharToLexer('{');
554             lexer.addStringLiteral(style.properties);
555             lexer.addCharToLexer('}');
556             lexer.addCharToLexer('\n');
557         }
558 
559         lexer.txtend = lexer.lexsize;
560 
561         node.insertNodeAtEnd(lexer.newNode(Node.TEXT_NODE, lexer.lexbuf, lexer.txtstart, lexer.txtend));
562 
563         // now insert style element into document head doc is root node. search its children for html node the head
564         // node should be first child of html node
565 
566         head = doc.findHEAD(lexer.configuration.tt);
567 
568         if (head != null)
569         {
570             head.insertNodeAtEnd(node);
571         }
572     }
573 
574     /**
575      * Ensure bidirectional links are consistent.
576      * @param node root node
577      */
578     private void fixNodeLinks(Node node)
579     {
580         Node child;
581 
582         if (node.prev != null)
583         {
584             node.prev.next = node;
585         }
586         else
587         {
588             node.parent.content = node;
589         }
590 
591         if (node.next != null)
592         {
593             node.next.prev = node;
594         }
595         else
596         {
597             node.parent.last = node;
598         }
599 
600         for (child = node.content; child != null; child = child.next)
601         {
602             child.parent = node;
603         }
604     }
605 
606     /**
607      * Used to strip child of node when the node has one and only one child.
608      * @param node parent node
609      */
610     private void stripOnlyChild(Node node)
611     {
612         Node child;
613 
614         child = node.content;
615         node.content = child.content;
616         node.last = child.last;
617         child.content = null;
618 
619         for (child = node.content; child != null; child = child.next)
620         {
621             child.parent = node;
622         }
623     }
624 
625     /**
626      * Used to strip font start and end tags.
627      * @param element original node
628      * @param pnode passed in as array to allow modification. pnode[0] will contain the final node
629      * @todo remove the pnode parameter and make it a return value
630      */
631     private void discardContainer(Node element, Node[] pnode)
632     {
633         Node node;
634         Node parent = element.parent;
635 
636         if (element.content != null)
637         {
638             element.last.next = element.next;
639 
640             if (element.next != null)
641             {
642                 element.next.prev = element.last;
643                 element.last.next = element.next;
644             }
645             else
646             {
647                 parent.last = element.last;
648             }
649 
650             if (element.prev != null)
651             {
652                 element.content.prev = element.prev;
653                 element.prev.next = element.content;
654             }
655             else
656             {
657                 parent.content = element.content;
658             }
659 
660             for (node = element.content; node != null; node = node.next)
661             {
662                 node.parent = parent;
663             }
664 
665             pnode[0] = element.content;
666         }
667         else
668         {
669             if (element.next != null)
670             {
671                 element.next.prev = element.prev;
672             }
673             else
674             {
675                 parent.last = element.prev;
676             }
677 
678             if (element.prev != null)
679             {
680                 element.prev.next = element.next;
681             }
682             else
683             {
684                 parent.content = element.next;
685             }
686 
687             pnode[0] = element.next;
688         }
689 
690         element.next = null;
691         element.content = null;
692     }
693 
694     /**
695      * Add style property to element, creating style attribute as needed and adding ; delimiter.
696      * @param node node
697      * @param property property added to node
698      */
699     private void addStyleProperty(Node node, String property)
700     {
701         AttVal av;
702 
703         for (av = node.attributes; av != null; av = av.next)
704         {
705             if (av.attribute.equals("style"))
706             {
707                 break;
708             }
709         }
710 
711         // if style attribute already exists then insert property
712 
713         if (av != null)
714         {
715             String s;
716 
717             s = addProperty(av.value, property);
718             av.value = s;
719         }
720         else
721         {
722             // else create new style attribute
723             av = new AttVal(node.attributes, null, '"', "style", property);
724             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
725             node.attributes = av;
726         }
727     }
728 
729     /**
730      * Create new string that consists of the combined style properties in s1 and s2. To merge property lists, we build
731      * a linked list of property/values and insert properties into the list in order, merging values for the same
732      * property name.
733      * @param s1 first property
734      * @param s2 second property
735      * @return merged properties
736      */
737     private String mergeProperties(String s1, String s2)
738     {
739         String s;
740         StyleProp prop;
741 
742         prop = createProps(null, s1);
743         prop = createProps(prop, s2);
744         s = createPropString(prop);
745         return s;
746     }
747 
748     /**
749      * Merge class attributes from 2 nodes.
750      * @param node Node
751      * @param child Child node
752      */
753     private void mergeClasses(Node node, Node child)
754     {
755         AttVal av;
756         String s1, s2, names;
757 
758         for (s2 = null, av = child.attributes; av != null; av = av.next)
759         {
760             if ("class".equals(av.attribute))
761             {
762                 s2 = av.value;
763                 break;
764             }
765         }
766 
767         for (s1 = null, av = node.attributes; av != null; av = av.next)
768         {
769             if ("class".equals(av.attribute))
770             {
771                 s1 = av.value;
772                 break;
773             }
774         }
775 
776         if (s1 != null)
777         {
778             if (s2 != null) // merge class names from both
779             {
780                 names = s1 + ' ' + s2;
781                 av.value = names;
782             }
783         }
784         else if (s2 != null) // copy class names from child
785         {
786             av = new AttVal(node.attributes, null, '"', "class", s2);
787             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
788             node.attributes = av;
789         }
790     }
791 
792     /**
793      * Merge style from 2 nodes.
794      * @param node Node
795      * @param child Child node
796      */
797     private void mergeStyles(Node node, Node child)
798     {
799         AttVal av;
800         String s1, s2, style;
801 
802         // the child may have a class attribute used for attaching styles, if so the class name needs to be copied to
803         // node's class
804         mergeClasses(node, child);
805 
806         for (s2 = null, av = child.attributes; av != null; av = av.next)
807         {
808             if (av.attribute.equals("style"))
809             {
810                 s2 = av.value;
811                 break;
812             }
813         }
814 
815         for (s1 = null, av = node.attributes; av != null; av = av.next)
816         {
817             if (av.attribute.equals("style"))
818             {
819                 s1 = av.value;
820                 break;
821             }
822         }
823 
824         if (s1 != null)
825         {
826             if (s2 != null) // merge styles from both
827             {
828                 style = mergeProperties(s1, s2);
829                 av.value = style;
830             }
831         }
832         else if (s2 != null) // copy style of child
833         {
834             av = new AttVal(node.attributes, null, '"', "style", s2);
835             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
836             node.attributes = av;
837         }
838     }
839 
840     /**
841      * Map a % font size to a named font size.
842      * @param size size in %
843      * @return font size name
844      */
845     private String fontSize2Name(String size)
846     {
847         String[] sizes = {"60%", "70%", "80%", null, "120%", "150%", "200%"};
848         String buf;
849 
850         if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6')
851         {
852             int n = size.charAt(0) - '0';
853             return sizes[n];
854         }
855 
856         if (size.length() > 0 && size.charAt(0) == '-')
857         {
858             if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6')
859             {
860                 int n = size.charAt(1) - '0';
861                 double x;
862 
863                 for (x = 1.0; n > 0; --n)
864                 {
865                     x *= 0.8;
866                 }
867 
868                 x *= 100.0;
869                 buf = "" + (int) x + "%";
870 
871                 return buf;
872             }
873 
874             return "smaller"; /* "70%"; */
875         }
876 
877         if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6')
878         {
879             int n = size.charAt(1) - '0';
880             double x;
881 
882             for (x = 1.0; n > 0; --n)
883             {
884                 x *= 1.2;
885             }
886 
887             x *= 100.0;
888             buf = "" + (int) x + "%";
889 
890             return buf;
891         }
892 
893         return "larger"; /* "140%" */
894     }
895 
896     /**
897      * Adds a font-family style.
898      * @param node Node
899      * @param face font face
900      */
901     private void addFontFace(Node node, String face)
902     {
903         addStyleProperty(node, "font-family: " + face);
904     }
905 
906     /**
907      * Adds a font size style.
908      * @param node Node
909      * @param size font size
910      */
911     private void addFontSize(Node node, String size)
912     {
913         if (size == null)
914         {
915             return;
916         }
917 
918         if ("6".equals(size) && node.tag == this.tt.tagP)
919         {
920             node.element = "h1";
921             this.tt.findTag(node);
922             return;
923         }
924 
925         if ("5".equals(size) && node.tag == this.tt.tagP)
926         {
927             node.element = "h2";
928             this.tt.findTag(node);
929             return;
930         }
931 
932         if ("4".equals(size) && node.tag == this.tt.tagP)
933         {
934             node.element = "h3";
935             this.tt.findTag(node);
936             return;
937         }
938 
939         String value = fontSize2Name(size);
940 
941         if (value != null)
942         {
943             addStyleProperty(node, "font-size: " + value);
944         }
945     }
946 
947     /**
948      * Adds a font color style.
949      * @param node Node
950      * @param color color value
951      */
952     private void addFontColor(Node node, String color)
953     {
954         addStyleProperty(node, "color: " + color);
955     }
956 
957     /**
958      * Adds an align style.
959      * @param node Node
960      * @param align align value
961      */
962     private void addAlign(Node node, String align)
963     {
964         // force alignment value to lower case
965         addStyleProperty(node, "text-align: " + align.toLowerCase());
966     }
967 
968     /**
969      * Add style properties to node corresponding to the font face, size and color attributes.
970      * @param node font tag
971      * @param av attribute list for node
972      */
973     private void addFontStyles(Node node, AttVal av)
974     {
975         while (av != null)
976         {
977             if (av.attribute.equals("face"))
978             {
979                 addFontFace(node, av.value);
980             }
981             else if (av.attribute.equals("size"))
982             {
983                 addFontSize(node, av.value);
984             }
985             else if (av.attribute.equals("color"))
986             {
987                 addFontColor(node, av.value);
988             }
989 
990             av = av.next;
991         }
992     }
993 
994     /**
995      * Symptom: <code>&lt;p align=center></code>. Action: <code>&lt;p style="text-align: center"></code>.
996      * @param lexer Lexer
997      * @param node node with center attribute. Will be modified to use css style.
998      */
999     private void textAlign(Lexer lexer, Node node)
1000     {
1001         AttVal av, prev;
1002 
1003         prev = null;
1004 
1005         for (av = node.attributes; av != null; av = av.next)
1006         {
1007             if (av.attribute.equals("align"))
1008             {
1009                 if (prev != null)
1010                 {
1011                     prev.next = av.next;
1012                 }
1013                 else
1014                 {
1015                     node.attributes = av.next;
1016                 }
1017 
1018                 if (av.value != null)
1019                 {
1020                     addAlign(node, av.value);
1021                 }
1022 
1023                 break;
1024             }
1025 
1026             prev = av;
1027         }
1028     }
1029 
1030     /**
1031      * Symptom: <code>&lt;dir>&lt;li></code> where <code>&lt;li></code> is only child. Action: coerce
1032      * <code>&lt;dir> &lt;li></code> to <code>&lt;div></code> with indent. The clean up rules use the pnode argument
1033      * to return the next node when the original node has been deleted.
1034      * @param lexer Lexer
1035      * @param node dir tag
1036      * @return <code>true</code> if a dir tag has been coerced to a div
1037      */
1038     private boolean dir2Div(Lexer lexer, Node node)
1039     {
1040         Node child;
1041 
1042         if (node.tag == this.tt.tagDir || node.tag == this.tt.tagUl || node.tag == this.tt.tagOl)
1043         {
1044             child = node.content;
1045 
1046             if (child == null)
1047             {
1048                 return false;
1049             }
1050 
1051             // check child has no peers
1052             if (child.next != null)
1053             {
1054                 return false;
1055             }
1056 
1057             if (child.tag != this.tt.tagLi)
1058             {
1059                 return false;
1060             }
1061 
1062             if (!child.implicit)
1063             {
1064                 return false;
1065             }
1066 
1067             // coerce dir to div
1068             node.tag = this.tt.tagDiv;
1069             node.element = "div";
1070             addStyleProperty(node, "margin-left: 2em");
1071             stripOnlyChild(node);
1072             return true;
1073         }
1074 
1075         return false;
1076     }
1077 
1078     /**
1079      * Symptom:
1080      * 
1081      * <pre>
1082      * &lt;center>
1083      * </pre>.
1084      * <p>
1085      * Action: replace <code>&lt;center></code> by <code>&lt;div style="text-align: center"></code>
1086      * </p>
1087      * @param lexer Lexer
1088      * @param node center tag
1089      * @param pnode pnode[0] is the same as node, passed in as an array to allow modification
1090      * @return <code>true</code> if a center tag has been replaced by a div
1091      */
1092     private boolean center2Div(Lexer lexer, Node node, Node[] pnode)
1093     {
1094         if (node.tag == this.tt.tagCenter)
1095         {
1096             if (lexer.configuration.dropFontTags)
1097             {
1098                 if (node.content != null)
1099                 {
1100                     Node last = node.last;
1101                     Node parent = node.parent;
1102 
1103                     discardContainer(node, pnode);
1104 
1105                     node = lexer.inferredTag("br");
1106 
1107                     if (last.next != null)
1108                     {
1109                         last.next.prev = node;
1110                     }
1111 
1112                     node.next = last.next;
1113                     last.next = node;
1114                     node.prev = last;
1115 
1116                     if (parent.last == last)
1117                     {
1118                         parent.last = node;
1119                     }
1120 
1121                     node.parent = parent;
1122                 }
1123                 else
1124                 {
1125                     Node prev = node.prev;
1126                     Node next = node.next;
1127                     Node parent = node.parent;
1128                     discardContainer(node, pnode);
1129 
1130                     node = lexer.inferredTag("br");
1131                     node.next = next;
1132                     node.prev = prev;
1133                     node.parent = parent;
1134 
1135                     if (next != null)
1136                     {
1137                         next.prev = node;
1138                     }
1139                     else
1140                     {
1141                         parent.last = node;
1142                     }
1143 
1144                     if (prev != null)
1145                     {
1146                         prev.next = node;
1147                     }
1148                     else
1149                     {
1150                         parent.content = node;
1151                     }
1152                 }
1153 
1154                 return true;
1155             }
1156             node.tag = this.tt.tagDiv;
1157             node.element = "div";
1158             addStyleProperty(node, "text-align: center");
1159             return true;
1160         }
1161 
1162         return false;
1163     }
1164 
1165     /**
1166      * Symptom: <code>&lt;div>&lt;div>...&lt;/div>&lt;/div></code> Action: merge the two divs. This is useful after
1167      * nested &lt;dir>s used by Word for indenting have been converted to &lt;div>s.
1168      * @param lexer Lexer
1169      * @param node first div
1170      * @return true if the divs have been merged
1171      */
1172     private boolean mergeDivs(Lexer lexer, Node node)
1173     {
1174         Node child;
1175 
1176         if (node.tag != this.tt.tagDiv)
1177         {
1178             return false;
1179         }
1180 
1181         child = node.content;
1182 
1183         if (child == null)
1184         {
1185             return false;
1186         }
1187 
1188         if (child.tag != this.tt.tagDiv)
1189         {
1190             return false;
1191         }
1192 
1193         if (child.next != null)
1194         {
1195             return false;
1196         }
1197 
1198         mergeStyles(node, child);
1199         stripOnlyChild(node);
1200         return true;
1201     }
1202 
1203     /**
1204      * Symptom:
1205      * <ul>
1206      * <li>
1207      * <ul>
1208      * ...
1209      * </ul>
1210      * </li>
1211      * </ul>
1212      * Action: discard outer list.
1213      * @param lexer Lexer
1214      * @param node Node
1215      * @param pnode passed in as array to allow modifications.
1216      * @return <code>true</code> if nested lists have been found and replaced
1217      */
1218     private boolean nestedList(Lexer lexer, Node node, Node[] pnode)
1219     {
1220         Node child, list;
1221 
1222         if (node.tag == this.tt.tagUl || node.tag == this.tt.tagOl)
1223         {
1224             child = node.content;
1225 
1226             if (child == null)
1227             {
1228                 return false;
1229             }
1230 
1231             // check child has no peers
1232 
1233             if (child.next != null)
1234             {
1235                 return false;
1236             }
1237 
1238             list = child.content;
1239 
1240             if (list == null)
1241             {
1242                 return false;
1243             }
1244 
1245             if (list.tag != node.tag)
1246             {
1247                 return false;
1248             }
1249 
1250             pnode[0] = list; // Set node to resume iteration
1251 
1252             // move inner list node into position of outer node
1253             list.prev = node.prev;
1254             list.next = node.next;
1255             list.parent = node.parent;
1256             fixNodeLinks(list);
1257 
1258             // get rid of outer ul and its li
1259             // XXX: Are we leaking the child node? -creitzel 7 Jun, 01
1260             child.content = null;
1261             node.content = null;
1262             node.next = null;
1263             node = null;
1264 
1265             // If prev node was a list the chances are this node should be appended to that list. Word has no way of
1266             // recognizing nested lists and just uses indents
1267             if (list.prev != null)
1268             {
1269                 if (list.prev.tag == this.tt.tagUl || list.prev.tag == this.tt.tagOl)
1270                 {
1271 
1272                     node = list;
1273                     list = node.prev;
1274 
1275                     list.next = node.next;
1276 
1277                     if (list.next != null)
1278                     {
1279                         list.next.prev = list;
1280                     }
1281 
1282                     child = list.last; /* <li> */
1283 
1284                     node.parent = child;
1285                     node.next = null;
1286                     node.prev = child.last;
1287                     fixNodeLinks(node);
1288                     cleanNode(lexer, node);
1289                 }
1290             }
1291 
1292             return true;
1293         }
1294 
1295         return false;
1296     }
1297 
1298     /**
1299      * Symptom: the only child of a block-level element is a presentation element such as B, I or FONT. Action: add
1300      * style "font-weight: bold" to the block and strip the &lt;b>element, leaving its children. example:
1301      * 
1302      * <pre>
1303      * &lt;p>
1304      * &lt;b>&lt;font face="Arial" size="6">Draft Recommended Practice&lt;/font>&lt;/b>
1305      * &lt;/p>
1306      * </pre>
1307      * 
1308      * becomes:
1309      * 
1310      * <pre>
1311      * &lt;p style="font-weight: bold; font-family: Arial; font-size: 6">
1312      * Draft Recommended Practice
1313      * &lt;/p>
1314      * </pre>
1315      * 
1316      * <p>
1317      * This code also replaces the align attribute by a style attribute. However, to avoid CSS problems with Navigator
1318      * 4, this isn't done for the elements: caption, tr and table
1319      * </p>
1320      * @param lexer Lexer
1321      * @param node parent node
1322      * @return <code>true</code> if the child node has been removed
1323      */
1324     private boolean blockStyle(Lexer lexer, Node node)
1325     {
1326         Node child;
1327 
1328         if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0)
1329         {
1330             if (node.tag != this.tt.tagTable && node.tag != this.tt.tagTr && node.tag != this.tt.tagLi)
1331             {
1332                 // check for align attribute
1333                 if (node.tag != this.tt.tagCaption)
1334                 {
1335                     textAlign(lexer, node);
1336                 }
1337 
1338                 child = node.content;
1339 
1340                 if (child == null)
1341                 {
1342                     return false;
1343                 }
1344 
1345                 // check child has no peers
1346                 if (child.next != null)
1347                 {
1348                     return false;
1349                 }
1350 
1351                 if (child.tag == this.tt.tagB)
1352                 {
1353                     mergeStyles(node, child);
1354                     addStyleProperty(node, "font-weight: bold");
1355                     stripOnlyChild(node);
1356                     return true;
1357                 }
1358 
1359                 if (child.tag == this.tt.tagI)
1360                 {
1361                     mergeStyles(node, child);
1362                     addStyleProperty(node, "font-style: italic");
1363                     stripOnlyChild(node);
1364                     return true;
1365                 }
1366 
1367                 if (child.tag == this.tt.tagFont)
1368                 {
1369                     mergeStyles(node, child);
1370                     addFontStyles(node, child.attributes);
1371                     stripOnlyChild(node);
1372                     return true;
1373                 }
1374             }
1375         }
1376 
1377         return false;
1378     }
1379 
1380     /**
1381      * If the node has only one b, i, or font child remove the child node and add the appropriate style attributes to
1382      * parent.
1383      * @param lexer Lexer
1384      * @param node parent node
1385      * @param pnode passed as an array to allow modifications
1386      * @return <code>true</code> if child node has been stripped, replaced by style attributes.
1387      */
1388     private boolean inlineStyle(Lexer lexer, Node node, Node[] pnode)
1389     {
1390         Node child;
1391 
1392         if (node.tag != this.tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0)
1393         {
1394             child = node.content;
1395 
1396             if (child == null)
1397             {
1398                 return false;
1399             }
1400 
1401             // check child has no peers
1402             if (child.next != null)
1403             {
1404                 return false;
1405             }
1406 
1407             if (child.tag == this.tt.tagB && lexer.configuration.logicalEmphasis)
1408             {
1409                 mergeStyles(node, child);
1410                 addStyleProperty(node, "font-weight: bold");
1411                 stripOnlyChild(node);
1412                 return true;
1413             }
1414 
1415             if (child.tag == this.tt.tagI && lexer.configuration.logicalEmphasis)
1416             {
1417                 mergeStyles(node, child);
1418                 addStyleProperty(node, "font-style: italic");
1419                 stripOnlyChild(node);
1420                 return true;
1421             }
1422 
1423             if (child.tag == this.tt.tagFont)
1424             {
1425                 mergeStyles(node, child);
1426                 addFontStyles(node, child.attributes);
1427                 stripOnlyChild(node);
1428                 return true;
1429             }
1430         }
1431 
1432         return false;
1433     }
1434 
1435     /**
1436      * Replace font elements by span elements, deleting the font element's attributes and replacing them by a single
1437      * style attribute.
1438      * @param lexer Lexer
1439      * @param node font tag
1440      * @param pnode passed as an array to allow modifications
1441      * @return <code>true</code> if a font tag has been dropped and replaced by style attributes
1442      */
1443     private boolean font2Span(Lexer lexer, Node node, Node[] pnode)
1444     {
1445         AttVal av, style, next;
1446 
1447         if (node.tag == this.tt.tagFont)
1448         {
1449             if (lexer.configuration.dropFontTags)
1450             {
1451                 discardContainer(node, pnode);
1452                 return false;
1453             }
1454 
1455             // if FONT is only child of parent element then leave alone
1456             if (node.parent.content == node && node.next == null)
1457             {
1458                 return false;
1459             }
1460 
1461             addFontStyles(node, node.attributes);
1462 
1463             // extract style attribute and free the rest
1464             av = node.attributes;
1465             style = null;
1466 
1467             while (av != null)
1468             {
1469                 next = av.next;
1470 
1471                 if (av.attribute.equals("style"))
1472                 {
1473                     av.next = null;
1474                     style = av;
1475                 }
1476 
1477                 av = next;
1478             }
1479 
1480             node.attributes = style;
1481 
1482             node.tag = this.tt.tagSpan;
1483             node.element = "span";
1484 
1485             return true;
1486         }
1487 
1488         return false;
1489     }
1490 
1491     /**
1492      * Applies all matching rules to a node.
1493      * @param lexer Lexer
1494      * @param node original node
1495      * @return cleaned up node
1496      */
1497     private Node cleanNode(Lexer lexer, Node node)
1498     {
1499         Node next = null;
1500         Node[] o = new Node[1];
1501         boolean b = false;
1502 
1503         for (next = node; node != null && node.isElement(); node = next)
1504         {
1505             o[0] = next;
1506 
1507             b = dir2Div(lexer, node);
1508             next = o[0];
1509             if (b)
1510             {
1511                 continue;
1512             }
1513 
1514             // Special case: true result means that arg node and its parent no longer exist.
1515             // So we must jump back up the CreateStyleProperties() call stack until we have a valid node reference.
1516             b = nestedList(lexer, node, o);
1517             next = o[0];
1518             if (b)
1519             {
1520                 return next;
1521             }
1522 
1523             b = center2Div(lexer, node, o);
1524             next = o[0];
1525             if (b)
1526             {
1527                 continue;
1528             }
1529 
1530             b = mergeDivs(lexer, node);
1531             next = o[0];
1532             if (b)
1533             {
1534                 continue;
1535             }
1536 
1537             b = blockStyle(lexer, node);
1538             next = o[0];
1539             if (b)
1540             {
1541                 continue;
1542             }
1543 
1544             b = inlineStyle(lexer, node, o);
1545             next = o[0];
1546             if (b)
1547             {
1548                 continue;
1549             }
1550 
1551             b = font2Span(lexer, node, o);
1552             next = o[0];
1553             if (b)
1554             {
1555                 continue;
1556             }
1557 
1558             break;
1559         }
1560 
1561         return next;
1562     }
1563 
1564     /**
1565      * Special case: if the current node is destroyed by CleanNode() lower in the tree, this node and its parent no
1566      * longer exist. So we must jump back up the CreateStyleProperties() call stack until we have a valid node
1567      * reference.
1568      * @param lexer Lexer
1569      * @param node Node
1570      * @param prepl passed in as array to allow modifications
1571      * @return cleaned Node
1572      */
1573     private Node createStyleProperties(Lexer lexer, Node node, Node[] prepl)
1574     {
1575         Node child = node.content;
1576 
1577         if (child != null)
1578         {
1579             Node[] repl = new Node[1];
1580             repl[0] = node;
1581             while (child != null)
1582             {
1583                 child = createStyleProperties(lexer, child, repl);
1584                 if (repl[0] != node)
1585                 {
1586                     return repl[0];
1587                 }
1588                 if (child != null)
1589                 {
1590                     child = child.next;
1591                 }
1592             }
1593         }
1594 
1595         return cleanNode(lexer, node);
1596     }
1597 
1598     /**
1599      * Find style attribute in node content, and replace it by corresponding class attribute.
1600      * @param lexer Lexer
1601      * @param node parent node
1602      */
1603     private void defineStyleRules(Lexer lexer, Node node)
1604     {
1605         Node child;
1606 
1607         if (node.content != null)
1608         {
1609             child = node.content;
1610             while (child != null)
1611             {
1612                 defineStyleRules(lexer, child);
1613                 child = child.next;
1614             }
1615         }
1616 
1617         style2Rule(lexer, node);
1618     }
1619 
1620     /**
1621      * Clean an html tree.
1622      * @param lexer Lexer
1623      * @param doc root node
1624      */
1625     public void cleanTree(Lexer lexer, Node doc)
1626     {
1627         Node[] repl = new Node[1];
1628         repl[0] = doc;
1629         doc = createStyleProperties(lexer, doc, repl);
1630 
1631         if (!lexer.configuration.makeClean)
1632         {
1633             defineStyleRules(lexer, doc);
1634             createStyleElement(lexer, doc);
1635         }
1636     }
1637 
1638     /**
1639      * simplifies <b><b>... </b> ... </b> etc.
1640      * @param node root Node
1641      */
1642     public void nestedEmphasis(Node node)
1643     {
1644         Node[] o = new Node[1];
1645         Node next;
1646 
1647         while (node != null)
1648         {
1649             next = node.next;
1650 
1651             if ((node.tag == this.tt.tagB || node.tag == this.tt.tagI)
1652                 && node.parent != null
1653                 && node.parent.tag == node.tag)
1654             {
1655                 // strip redundant inner element
1656                 o[0] = next;
1657                 discardContainer(node, o);
1658                 next = o[0];
1659                 node = next;
1660                 continue;
1661             }
1662 
1663             if (node.content != null)
1664             {
1665                 nestedEmphasis(node.content);
1666             }
1667 
1668             node = next;
1669         }
1670     }
1671 
1672     /**
1673      * Replace i by em and b by strong.
1674      * @param node root Node
1675      */
1676     public void emFromI(Node node)
1677     {
1678         while (node != null)
1679         {
1680             if (node.tag == this.tt.tagI)
1681             {
1682                 node.element = this.tt.tagEm.name;
1683                 node.tag = this.tt.tagEm;
1684             }
1685             else if (node.tag == this.tt.tagB)
1686             {
1687                 node.element = this.tt.tagStrong.name;
1688                 node.tag = this.tt.tagStrong;
1689             }
1690 
1691             if (node.content != null)
1692             {
1693                 emFromI(node.content);
1694             }
1695 
1696             node = node.next;
1697         }
1698     }
1699 
1700     /**
1701      * Some people use dir or ul without an li to indent the content. The pattern to look for is a list with a single
1702      * implicit li. This is recursively replaced by an implicit blockquote.
1703      * @param node root Node
1704      */
1705     public void list2BQ(Node node)
1706     {
1707         while (node != null)
1708         {
1709             if (node.content != null)
1710             {
1711                 list2BQ(node.content);
1712             }
1713 
1714             if (node.tag != null
1715                 && node.tag.getParser() == ParserImpl.LIST
1716                 && node.hasOneChild()
1717                 && node.content.implicit)
1718             {
1719                 stripOnlyChild(node);
1720                 node.element = this.tt.tagBlockquote.name;
1721                 node.tag = this.tt.tagBlockquote;
1722                 node.implicit = true;
1723             }
1724 
1725             node = node.next;
1726         }
1727     }
1728 
1729     /**
1730      * Replace implicit blockquote by div with an indent taking care to reduce nested blockquotes to a single div with
1731      * the indent set to match the nesting depth.
1732      * @param node root Node
1733      */
1734     public void bQ2Div(Node node)
1735     {
1736         int indent;
1737         String indentBuf;
1738         AttVal attval;
1739 
1740         while (node != null)
1741         {
1742             if (node.tag == this.tt.tagBlockquote && node.implicit)
1743             {
1744                 indent = 1;
1745 
1746                 while (node.hasOneChild() && node.content.tag == this.tt.tagBlockquote && node.implicit)
1747                 {
1748                     ++indent;
1749                     stripOnlyChild(node);
1750                 }
1751 
1752                 if (node.content != null)
1753                 {
1754                     bQ2Div(node.content);
1755                 }
1756 
1757                 indentBuf = "margin-left: " + (new Integer(2 * indent)).toString() + "em";
1758 
1759                 node.element = this.tt.tagDiv.name;
1760                 node.tag = this.tt.tagDiv;
1761 
1762                 attval = node.getAttrByName("style");
1763 
1764                 if (attval != null && attval.value != null)
1765                 {
1766                     attval.value = indentBuf + "; " + attval.value;
1767                 }
1768                 else
1769                 {
1770                     node.addAttribute("style", indentBuf);
1771                 }
1772             }
1773             else if (node.content != null)
1774             {
1775                 bQ2Div(node.content);
1776             }
1777 
1778             node = node.next;
1779         }
1780     }
1781 
1782     /**
1783      * Find the enclosing table cell for the given node.
1784      * @param node Node
1785      * @return enclosing cell node
1786      */
1787     Node findEnclosingCell(Node node)
1788     {
1789         Node check;
1790 
1791         for (check = node; check != null; check = check.parent)
1792         {
1793             if (check.tag == tt.tagTd)
1794             {
1795                 return check;
1796             }
1797         }
1798         return null;
1799     }
1800 
1801     /**
1802      * node is <code>&lt;![if ...]&gt;</code> prune up to <code>&lt;![endif]&gt;</code>.
1803      * @param lexer Lexer
1804      * @param node Node
1805      * @return cleaned up Node
1806      */
1807     public Node pruneSection(Lexer lexer, Node node)
1808     {
1809         for (;;)
1810         {
1811 
1812             // FG: commented out - don't add &nbsp; to empty cells
1813 
1814             // if ((Lexer.getString(node.textarray, node.start, 21)).equals("if !supportEmptyParas"))
1815             // {
1816             // Node cell = findEnclosingCell(node);
1817             // if (cell != null)
1818             // {
1819             // // Need to put &nbsp; into cell so it doesn't look weird
1820             // char onesixty[] = {(char) 160, (char) 0};
1821             // Node nbsp = lexer.newLiteralTextNode(lexer, onesixty);
1822             // Node.insertNodeBeforeElement(node, nbsp);
1823             // }
1824             // }
1825 
1826             // discard node and returns next
1827             node = Node.discardElement(node);
1828 
1829             if (node == null)
1830             {
1831                 return null;
1832             }
1833 
1834             if (node.type == Node.SECTION_TAG)
1835             {
1836                 if ((TidyUtils.getString(node.textarray, node.start, 2)).equals("if"))
1837                 {
1838                     node = pruneSection(lexer, node);
1839                     continue;
1840                 }
1841 
1842                 if ((TidyUtils.getString(node.textarray, node.start, 5)).equals("endif"))
1843                 {
1844                     node = Node.discardElement(node);
1845                     break;
1846                 }
1847             }
1848         }
1849 
1850         return node;
1851     }
1852 
1853     /**
1854      * Drop if/endif sections inserted by word2000.
1855      * @param lexer Lexer
1856      * @param node Node root node
1857      */
1858     public void dropSections(Lexer lexer, Node node)
1859     {
1860         while (node != null)
1861         {
1862             if (node.type == Node.SECTION_TAG)
1863             {
1864                 // prune up to matching endif
1865                 if ((TidyUtils.getString(node.textarray, node.start, 2)).equals("if")
1866                     && (!(TidyUtils.getString(node.textarray, node.start, 7)).equals("if !vml"))) // #444394 - fix 13
1867                 // Sep 01
1868                 {
1869                     node = pruneSection(lexer, node);
1870                     continue;
1871                 }
1872 
1873                 // discard others as well
1874                 node = Node.discardElement(node);
1875                 continue;
1876             }
1877 
1878             if (node.content != null)
1879             {
1880                 dropSections(lexer, node.content);
1881             }
1882 
1883             node = node.next;
1884         }
1885     }
1886 
1887     /**
1888      * Remove word2000 attributes from node.
1889      * @param node node to cleanup
1890      */
1891     public void purgeWord2000Attributes(Node node)
1892     {
1893         AttVal attr = null;
1894         AttVal next = null;
1895         AttVal prev = null;
1896 
1897         for (attr = node.attributes; attr != null; attr = next)
1898         {
1899             next = attr.next;
1900 
1901             // special check for class="Code" denoting pre text
1902             // Pass thru user defined styles as HTML class names
1903             if (attr.attribute != null && attr.value != null && attr.attribute.equals("class"))
1904             {
1905                 if (attr.value.equals("Code") || !attr.value.startsWith("Mso"))
1906                 {
1907                     prev = attr;
1908                     continue;
1909                 }
1910             }
1911 
1912             if (attr.attribute != null
1913                 && (attr.attribute.equals("class")
1914                     || attr.attribute.equals("style")
1915                     || attr.attribute.equals("lang")
1916                     || attr.attribute.startsWith("x:") || ((attr.attribute.equals("height") || attr.attribute
1917                     .equals("width")) && //
1918                 (node.tag == this.tt.tagTd || node.tag == this.tt.tagTr || node.tag == this.tt.tagTh))))
1919             {
1920                 if (prev != null)
1921                 {
1922                     prev.next = next;
1923                 }
1924                 else
1925                 {
1926                     node.attributes = next;
1927                 }
1928 
1929             }
1930             else
1931             {
1932                 prev = attr;
1933             }
1934         }
1935     }
1936 
1937     /**
1938      * Word2000 uses span excessively, so we strip span out.
1939      * @param lexer Lexer
1940      * @param span Node span
1941      * @return cleaned node
1942      */
1943     public Node stripSpan(Lexer lexer, Node span)
1944     {
1945         Node node;
1946         Node prev = null;
1947         Node content;
1948 
1949         // deal with span elements that have content by splicing the content in place of the span after having
1950         // processed it
1951 
1952         cleanWord2000(lexer, span.content);
1953         content = span.content;
1954 
1955         if (span.prev != null)
1956         {
1957             prev = span.prev;
1958         }
1959         else if (content != null)
1960         {
1961             node = content;
1962             content = content.next;
1963             node.removeNode();
1964             Node.insertNodeBeforeElement(span, node);
1965             prev = node;
1966         }
1967 
1968         while (content != null)
1969         {
1970             node = content;
1971             content = content.next;
1972             node.removeNode();
1973             prev.insertNodeAfterElement(node);
1974             prev = node;
1975         }
1976 
1977         if (span.next == null)
1978         {
1979             span.parent.last = prev;
1980         }
1981 
1982         node = span.next;
1983         span.content = null;
1984         Node.discardElement(span);
1985         return node;
1986     }
1987 
1988     /**
1989      * Map non-breaking spaces to regular spaces.
1990      * @param lexer Lexer
1991      * @param node Node
1992      */
1993     private void normalizeSpaces(Lexer lexer, Node node)
1994     {
1995         while (node != null)
1996         {
1997             if (node.content != null)
1998             {
1999                 normalizeSpaces(lexer, node.content);
2000             }
2001 
2002             if (node.type == Node.TEXT_NODE)
2003             {
2004                 int i;
2005                 int[] c = new int[1];
2006                 int p = node.start;
2007 
2008                 for (i = node.start; i < node.end; ++i)
2009                 {
2010                     c[0] = node.textarray[i];
2011 
2012                     // look for UTF-8 multibyte character
2013                     if (c[0] > 0x7F)
2014                     {
2015                         i += PPrint.getUTF8(node.textarray, i, c);
2016                     }
2017 
2018                     if (c[0] == 160)
2019                     {
2020                         c[0] = ' ';
2021                     }
2022 
2023                     p = PPrint.putUTF8(node.textarray, p, c[0]);
2024                 }
2025             }
2026 
2027             node = node.next;
2028         }
2029     }
2030 
2031     /**
2032      * Used to hunt for hidden preformatted sections.
2033      * @param node checked node
2034      * @return <code>true</code> if the node has a "margin-top: 0" or "margin-bottom: 0" style
2035      */
2036     boolean noMargins(Node node)
2037     {
2038         AttVal attval = node.getAttrByName("style");
2039 
2040         if (attval == null || attval.value == null)
2041         {
2042             return false;
2043         }
2044 
2045         // search for substring "margin-top: 0"
2046         if (attval.value.indexOf("margin-top: 0") == -1)
2047         {
2048             return false;
2049         }
2050 
2051         // search for substring "margin-top: 0"
2052         if (attval.value.indexOf("margin-bottom: 0") == -1)
2053         {
2054             return false;
2055         }
2056 
2057         return true;
2058     }
2059 
2060     /**
2061      * Does element have a single space as its content?
2062      * @param lexer Lexer
2063      * @param node checked node
2064      * @return <code>true</code> if the element has a single space as its content
2065      */
2066     boolean singleSpace(Lexer lexer, Node node)
2067     {
2068         if (node.content != null)
2069         {
2070             node = node.content;
2071 
2072             if (node.next != null)
2073             {
2074                 return false;
2075             }
2076 
2077             if (node.type != Node.TEXT_NODE)
2078             {
2079                 return false;
2080             }
2081 
2082             if (((node.end - node.start) == 1) && lexer.lexbuf[node.start] == ' ')
2083             {
2084                 return true;
2085             }
2086 
2087             if ((node.end - node.start) == 2)
2088             {
2089                 int[] c = new int[1];
2090 
2091                 PPrint.getUTF8(lexer.lexbuf, node.start, c);
2092 
2093                 if (c[0] == 160)
2094                 {
2095                     return true;
2096                 }
2097             }
2098         }
2099 
2100         return false;
2101     }
2102 
2103     /**
2104      * This is a major clean up to strip out all the extra stuff you get when you save as web page from Word 2000. It
2105      * doesn't yet know what to do with VML tags, but these will appear as errors unless you declare them as new tags,
2106      * such as o:p which needs to be declared as inline.
2107      * @param lexer Lexer
2108      * @param node node to clean up
2109      */
2110     public void cleanWord2000(Lexer lexer, Node node)
2111     {
2112         // used to a list from a sequence of bulletted p's
2113         Node list = null;
2114 
2115         while (node != null)
2116         {
2117 
2118             // get rid of Word's xmlns attributes
2119             if (node.tag == tt.tagHtml)
2120             {
2121                 // check that it's a Word 2000 document
2122                 if ((node.getAttrByName("xmlns:o") == null))
2123                 {
2124                     return;
2125                 }
2126                 lexer.configuration.tt.freeAttrs(node);
2127             }
2128 
2129             // fix up preformatted sections by looking for a sequence of paragraphs with zero top/bottom margin
2130             if (node.tag == tt.tagP)
2131             {
2132                 if (noMargins(node))
2133                 {
2134                     Node pre;
2135                     Node next;
2136                     Node.coerceNode(lexer, node, tt.tagPre);
2137 
2138                     purgeWord2000Attributes(node);
2139 
2140                     if (node.content != null)
2141                     {
2142                         cleanWord2000(lexer, node.content);
2143                     }
2144 
2145                     pre = node;
2146                     node = node.next;
2147 
2148                     // continue to strip p's
2149                     while (node.tag == tt.tagP && noMargins(node))
2150                     {
2151                         next = node.next;
2152                         node.removeNode();
2153                         pre.insertNodeAtEnd(lexer.newLineNode());
2154                         pre.insertNodeAtEnd(node);
2155                         stripSpan(lexer, node);
2156                         node = next;
2157                     }
2158 
2159                     if (node == null)
2160                     {
2161                         break;
2162                     }
2163                 }
2164             }
2165 
2166             if (node.tag != null && TidyUtils.toBoolean(node.tag.model & Dict.CM_BLOCK) && singleSpace(lexer, node))
2167             {
2168                 node = stripSpan(lexer, node);
2169                 continue;
2170             }
2171 
2172             // discard Word's style verbiage
2173             if (node.tag == this.tt.tagStyle || node.tag == this.tt.tagMeta || node.type == Node.COMMENT_TAG)
2174             {
2175                 node = Node.discardElement(node);
2176                 continue;
2177             }
2178 
2179             // strip out all span and font tags Word scatters so liberally!
2180             if (node.tag == this.tt.tagSpan || node.tag == this.tt.tagFont)
2181             {
2182                 node = stripSpan(lexer, node);
2183                 continue;
2184             }
2185 
2186             if (node.tag == this.tt.tagLink)
2187             {
2188                 AttVal attr = node.getAttrByName("rel");
2189 
2190                 if (attr != null && attr.value != null && attr.value.equals("File-List"))
2191                 {
2192                     node = Node.discardElement(node);
2193                     continue;
2194                 }
2195             }
2196 
2197             // discard empty paragraphs
2198             if (node.content == null && node.tag == this.tt.tagP)
2199             {
2200                 node = Node.discardElement(node);
2201                 continue;
2202             }
2203 
2204             if (node.tag == this.tt.tagP)
2205             {
2206                 AttVal attr = node.getAttrByName("class");
2207                 AttVal atrStyle = node.getAttrByName("style");
2208 
2209                 // (JES) Sometimes Word marks a list item with the following hokie syntax
2210                 // <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
2211                 // translate these into <li>
2212 
2213                 // map sequence of <p class="MsoListBullet"> to <ul> ... </ul>
2214                 // map <p class="MsoListNumber"> to <ol>...</ol>
2215                 if (attr != null
2216                     && attr.value != null
2217                     && ((attr.value.equals("MsoListBullet") || attr.value.equals("MsoListNumber")) //
2218                     || (atrStyle != null && (atrStyle.value.indexOf("mso-list:") != -1)))) // 463066 - fix by Joel
2219                 // Shafer 19 Sep 01
2220                 {
2221                     Dict listType = tt.tagUl;
2222 
2223                     if (attr.value.equals("MsoListNumber"))
2224                     {
2225                         listType = tt.tagOl;
2226                     }
2227 
2228                     Node.coerceNode(lexer, node, this.tt.tagLi);
2229 
2230                     if (list == null || list.tag != listType)
2231                     {
2232                         list = lexer.inferredTag(listType.name);
2233                         Node.insertNodeBeforeElement(node, list);
2234                     }
2235 
2236                     purgeWord2000Attributes(node);
2237 
2238                     if (node.content != null)
2239                     {
2240                         cleanWord2000(lexer, node.content);
2241                     }
2242 
2243                     // remove node and append to contents of list
2244                     node.removeNode();
2245                     list.insertNodeAtEnd(node);
2246                     node = list;
2247                 }
2248                 // map sequence of <p class="Code"> to <pre> ... </pre>
2249                 else if (attr != null && attr.value != null && attr.value.equals("Code"))
2250                 {
2251                     Node br = lexer.newLineNode();
2252                     normalizeSpaces(lexer, node);
2253 
2254                     if (list == null || list.tag != this.tt.tagPre)
2255                     {
2256                         list = lexer.inferredTag("pre");
2257                         Node.insertNodeBeforeElement(node, list);
2258                     }
2259 
2260                     // remove node and append to contents of list
2261                     node.removeNode();
2262                     list.insertNodeAtEnd(node);
2263                     stripSpan(lexer, node);
2264                     list.insertNodeAtEnd(br);
2265                     node = list.next;
2266                 }
2267                 else
2268                 {
2269                     list = null;
2270                 }
2271             }
2272             else
2273             {
2274                 list = null;
2275             }
2276 
2277             // strip out style and class attributes
2278             if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2279             {
2280                 purgeWord2000Attributes(node);
2281             }
2282 
2283             if (node.content != null)
2284             {
2285                 cleanWord2000(lexer, node.content);
2286             }
2287 
2288             node = node.next;
2289         }
2290     }
2291 
2292     /**
2293      * Check if the current document is a converted Word document.
2294      * @param root root Node
2295      * @return <code>true</code> if the document has been geenrated by Microsoft Word.
2296      */
2297     public boolean isWord2000(Node root)
2298     {
2299         AttVal attval;
2300         Node node;
2301         Node head;
2302         Node html = root.findHTML(this.tt);
2303 
2304         if (html != null && html.getAttrByName("xmlns:o") != null)
2305         {
2306             return true;
2307         }
2308 
2309         // search for <meta name="GENERATOR" content="Microsoft ...">
2310         head = root.findHEAD(tt);
2311 
2312         if (head != null)
2313         {
2314             for (node = head.content; node != null; node = node.next)
2315             {
2316                 if (node.tag != tt.tagMeta)
2317                 {
2318                     continue;
2319                 }
2320 
2321                 attval = node.getAttrByName("name");
2322 
2323                 if (attval == null || attval.value == null)
2324                 {
2325                     continue;
2326                 }
2327 
2328                 if (!"generator".equals(attval.value))
2329                 {
2330                     continue;
2331                 }
2332 
2333                 attval = node.getAttrByName("content");
2334 
2335                 if (attval == null || attval.value == null)
2336                 {
2337                     continue;
2338                 }
2339 
2340                 if (attval.value.indexOf("Microsoft") != -1)
2341                 {
2342                     return true;
2343                 }
2344             }
2345         }
2346 
2347         return false;
2348     }
2349 
2350     /**
2351      * Where appropriate move object elements from head to body.
2352      * @param lexer Lexer
2353      * @param html html node
2354      */
2355     static void bumpObject(Lexer lexer, Node html)
2356     {
2357         if (html == null)
2358         {
2359             return;
2360         }
2361 
2362         Node node, next, head = null, body = null;
2363         TagTable tt = lexer.configuration.tt;
2364         for (node = html.content; node != null; node = node.next)
2365         {
2366             if (node.tag == tt.tagHead)
2367             {
2368                 head = node;
2369             }
2370 
2371             if (node.tag == tt.tagBody)
2372             {
2373                 body = node;
2374             }
2375         }
2376 
2377         if (head != null && body != null)
2378         {
2379             for (node = head.content; node != null; node = next)
2380             {
2381                 next = node.next;
2382 
2383                 if (node.tag == tt.tagObject)
2384                 {
2385                     Node child;
2386                     boolean bump = false;
2387 
2388                     for (child = node.content; child != null; child = child.next)
2389                     {
2390                         // bump to body unless content is param
2391                         if ((child.type == Node.TEXT_NODE && !node.isBlank(lexer)) || child.tag != tt.tagParam)
2392                         {
2393                             bump = true;
2394                             break;
2395                         }
2396                     }
2397 
2398                     if (bump)
2399                     {
2400                         node.removeNode();
2401                         body.insertNodeAtStart(node);
2402                     }
2403                 }
2404             }
2405         }
2406     }
2407 
2408 }