View Javadoc

1   /*
2    *  Java HTML Tidy - JTidy
3    *  HTML parser and pretty printer
4    *
5    *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6    *  Institute of Technology, Institut National de Recherche en
7    *  Informatique et en Automatique, Keio University). All Rights
8    *  Reserved.
9    *
10   *  Contributing Author(s):
11   *
12   *     Dave Raggett <dsr@w3.org>
13   *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14   *     Gary L Peskin <garyp@firstech.com> (Java development)
15   *     Sami Lempinen <sami@lempinen.net> (release management)
16   *     Fabrizio Giustina <fgiust at users.sourceforge.net>
17   *
18   *  The contributing author(s) would like to thank all those who
19   *  helped with testing, bug fixes, and patience.  This wouldn't
20   *  have been possible without all of you.
21   *
22   *  COPYRIGHT NOTICE:
23   * 
24   *  This software and documentation is provided "as is," and
25   *  the copyright holders and contributing author(s) make no
26   *  representations or warranties, express or implied, including
27   *  but not limited to, warranties of merchantability or fitness
28   *  for any particular purpose or that the use of the software or
29   *  documentation will not infringe any third party patents,
30   *  copyrights, trademarks or other rights. 
31   *
32   *  The copyright holders and contributing author(s) will not be
33   *  liable for any direct, indirect, special or consequential damages
34   *  arising out of any use of the software or documentation, even if
35   *  advised of the possibility of such damage.
36   *
37   *  Permission is hereby granted to use, copy, modify, and distribute
38   *  this source code, or portions hereof, documentation and executables,
39   *  for any purpose, without fee, subject to the following restrictions:
40   *
41   *  1. The origin of this source code must not be misrepresented.
42   *  2. Altered versions must be plainly marked as such and must
43   *     not be misrepresented as being the original source.
44   *  3. This Copyright notice may not be removed or altered from any
45   *     source or altered source distribution.
46   * 
47   *  The copyright holders and contributing author(s) specifically
48   *  permit, without fee, and encourage the use of this source code
49   *  as a component for supporting the Hypertext Markup Language in
50   *  commercial products. If you use this source code in a product,
51   *  acknowledgment is not required but would be appreciated.
52   *
53   */
54  package org.w3c.tidy;
55  
56  /***
57   * Clean up misuse of presentation markup. Filters from other formats such as Microsoft Word often make excessive use of
58   * presentation markup such as font tags, B, I, and the align attribute. By applying a set of production rules, it is
59   * straightforward to transform this to use CSS. Some rules replace some of the children of an element by style
60   * properties on the element, e.g.
61   * <p>
62   * <b>... </b>
63   * </p>.
64   * <p style="font-weight: bold">
65   * ...
66   * </p>
67   * Such rules are applied to the element's content and then to the element itself until no more rules apply.
68   * Having applied all the rules to an element, it will have a style attribute with one or more properties. Other rules
69   * strip the element they apply to, replacing it by style properties on the contents, e.g. <dir>
70   * <li>
71   * <p>
72   * ...</li>
73   * </dir>.
74   * <p style="margin-left: 1em">
75   * ... These rules are applied to an element before processing its content and replace the current element by the first
76   * element in the exposed content. After applying both sets of rules, you can replace the style attribute by a class
77   * value and style rule in the document head. To support this, an association of styles and class names is built. A
78   * naive approach is to rely on string matching to test when two property lists are the same. A better approach would be
79   * to first sort the properties before matching.
80   * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
81   * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
82   * @author Fabrizio Giustina
83   * @version $Revision: 1.25 $ ($Author: fgiust $)
84   */
85  public class Clean
86  {
87  
88      /***
89       * sequential number for generated css classes.
90       */
91      private int classNum = 1;
92  
93      /***
94       * Tag table.
95       */
96      private TagTable tt;
97  
98      /***
99       * Instantiates a new Clean.
100      * @param tagTable tag table instance
101      */
102     public Clean(TagTable tagTable)
103     {
104         this.tt = tagTable;
105     }
106 
107     /***
108      * Insert a css style property.
109      * @param props StyleProp instance
110      * @param name property name
111      * @param value property value
112      * @return StyleProp containin the given property
113      */
114     private StyleProp insertProperty(StyleProp props, String name, String value)
115     {
116         StyleProp first, prev, prop;
117         int cmp;
118 
119         prev = null;
120         first = props;
121 
122         while (props != null)
123         {
124             cmp = props.name.compareTo(name);
125 
126             if (cmp == 0)
127             {
128                 // this property is already defined, ignore new value
129                 return first;
130             }
131 
132             if (cmp > 0) // props.name > name
133             {
134                 // insert before this
135 
136                 prop = new StyleProp(name, value, props);
137 
138                 if (prev != null)
139                 {
140                     prev.next = prop;
141                 }
142                 else
143                 {
144                     first = prop;
145                 }
146 
147                 return first;
148             }
149 
150             prev = props;
151             props = props.next;
152         }
153 
154         prop = new StyleProp(name, value, null);
155 
156         if (prev != null)
157         {
158             prev.next = prop;
159         }
160         else
161         {
162             first = prop;
163         }
164 
165         return first;
166     }
167 
168     /***
169      * Create sorted linked list of properties from style string.
170      * @param prop StyleProp
171      * @param style style string
172      * @return StyleProp with given style
173      */
174     private StyleProp createProps(StyleProp prop, String style)
175     {
176         int nameEnd;
177         int valueEnd;
178         int valueStart = 0;
179         int nameStart = 0;
180         boolean more;
181 
182         nameStart = 0;
183         while (nameStart < style.length())
184         {
185             while (nameStart < style.length() && style.charAt(nameStart) == ' ')
186             {
187                 ++nameStart;
188             }
189 
190             nameEnd = nameStart;
191 
192             while (nameEnd < style.length())
193             {
194                 if (style.charAt(nameEnd) == ':')
195                 {
196                     valueStart = nameEnd + 1;
197                     break;
198                 }
199 
200                 ++nameEnd;
201             }
202 
203             if (nameEnd >= style.length() || style.charAt(nameEnd) != ':')
204             {
205                 break;
206             }
207 
208             while (valueStart < style.length() && style.charAt(valueStart) == ' ')
209             {
210                 ++valueStart;
211             }
212 
213             valueEnd = valueStart;
214             more = false;
215 
216             while (valueEnd < style.length())
217             {
218                 if (style.charAt(valueEnd) == ';')
219                 {
220                     more = true;
221                     break;
222                 }
223 
224                 ++valueEnd;
225             }
226 
227             prop = insertProperty(prop, style.substring(nameStart, nameEnd), style.substring(valueStart, valueEnd));
228 
229             if (more)
230             {
231                 nameStart = valueEnd + 1;
232                 continue;
233             }
234 
235             break;
236         }
237 
238         return prop;
239     }
240 
241     /***
242      * Create a css property.
243      * @param props StyleProp
244      * @return css property as String
245      */
246     private String createPropString(StyleProp props)
247     {
248         String style = "";
249         int len;
250         StyleProp prop;
251 
252         // compute length
253         for (len = 0, prop = props; prop != null; prop = prop.next)
254         {
255             len += prop.name.length() + 2;
256             len += prop.value.length() + 2;
257         }
258 
259         for (prop = props; prop != null; prop = prop.next)
260         {
261             style = style.concat(prop.name);
262             style = style.concat(": ");
263 
264             style = style.concat(prop.value);
265 
266             if (prop.next == null)
267             {
268                 break;
269             }
270 
271             style = style.concat("; ");
272         }
273 
274         return style;
275     }
276 
277     /***
278      * Creates a string with merged properties.
279      * @param style css style
280      * @param property css properties
281      * @return merged string
282      */
283     private String addProperty(String style, String property)
284     {
285         StyleProp prop;
286 
287         prop = createProps(null, style);
288         prop = createProps(prop, property);
289         style = createPropString(prop);
290         return style;
291     }
292 
293     /***
294      * Generates a new css class name.
295      * @param lexer Lexer
296      * @param tag Tag
297      * @return generated css class
298      */
299     private String gensymClass(Lexer lexer, String tag)
300     {
301         String str;
302 
303         str = lexer.configuration.cssPrefix == null ? lexer.configuration.cssPrefix + this.classNum : "c"
304             + this.classNum;
305         this.classNum++;
306         return str;
307     }
308 
309     /***
310      * Finds a css style.
311      * @param lexer Lexer
312      * @param tag tag name
313      * @param properties css properties
314      * @return style string
315      */
316     private String findStyle(Lexer lexer, String tag, String properties)
317     {
318         Style style;
319 
320         for (style = lexer.styles; style != null; style = style.next)
321         {
322             if (style.tag.equals(tag) && style.properties.equals(properties))
323             {
324                 return style.tagClass;
325             }
326         }
327 
328         style = new Style(tag, gensymClass(lexer, tag), properties, lexer.styles);
329         lexer.styles = style;
330         return style.tagClass;
331     }
332 
333     /***
334      * Find style attribute in node, and replace it by corresponding class attribute. Search for class in style
335      * dictionary otherwise gensym new class and add to dictionary. Assumes that node doesn't have a class attribute.
336      * @param lexer Lexer
337      * @param node node with a style attribute
338      */
339     private void style2Rule(Lexer lexer, Node node)
340     {
341         AttVal styleattr, classattr;
342         String classname;
343 
344         styleattr = node.getAttrByName("style");
345 
346         if (styleattr != null)
347         {
348             classname = findStyle(lexer, node.element, styleattr.value);
349             classattr = node.getAttrByName("class");
350 
351             // if there already is a class attribute then append class name after a space
352 
353             if (classattr != null)
354             {
355                 classattr.value = classattr.value + " " + classname;
356                 node.removeAttribute(styleattr);
357             }
358             else
359             {
360                 // reuse style attribute for class attribute
361                 styleattr.attribute = "class";
362                 styleattr.value = classname;
363             }
364         }
365     }
366 
367     /***
368      * Adds a css rule for color.
369      * @param lexer Lexer
370      * @param selector css selector
371      * @param color color value
372      */
373     private void addColorRule(Lexer lexer, String selector, String color)
374     {
375         if (color != null)
376         {
377             lexer.addStringLiteral(selector);
378             lexer.addStringLiteral(" { color: ");
379             lexer.addStringLiteral(color);
380             lexer.addStringLiteral(" }\n");
381         }
382     }
383 
384     /***
385      * Move presentation attribs from body to style element.
386      * 
387      * <pre>
388      * background="foo" . body { background-image: url(foo) }
389      * bgcolor="foo" . body { background-color: foo }
390      * text="foo" . body { color: foo }
391      * link="foo" . :link { color: foo }
392      * vlink="foo" . :visited { color: foo }
393      * alink="foo" . :active { color: foo }
394      * </pre>
395      * 
396      * @param lexer Lexer
397      * @param body body node
398      */
399     private void cleanBodyAttrs(Lexer lexer, Node body)
400     {
401         AttVal attr;
402         String bgurl = null;
403         String bgcolor = null;
404         String color = null;
405 
406         attr = body.getAttrByName("background");
407 
408         if (attr != null)
409         {
410             bgurl = attr.value;
411             attr.value = null;
412             body.removeAttribute(attr);
413         }
414 
415         attr = body.getAttrByName("bgcolor");
416 
417         if (attr != null)
418         {
419             bgcolor = attr.value;
420             attr.value = null;
421             body.removeAttribute(attr);
422         }
423 
424         attr = body.getAttrByName("text");
425 
426         if (attr != null)
427         {
428             color = attr.value;
429             attr.value = null;
430             body.removeAttribute(attr);
431         }
432 
433         if (bgurl != null || bgcolor != null || color != null)
434         {
435             lexer.addStringLiteral(" body {\n");
436 
437             if (bgurl != null)
438             {
439                 lexer.addStringLiteral("  background-image: url(");
440                 lexer.addStringLiteral(bgurl);
441                 lexer.addStringLiteral(");\n");
442             }
443 
444             if (bgcolor != null)
445             {
446                 lexer.addStringLiteral("  background-color: ");
447                 lexer.addStringLiteral(bgcolor);
448                 lexer.addStringLiteral(";\n");
449             }
450 
451             if (color != null)
452             {
453                 lexer.addStringLiteral("  color: ");
454                 lexer.addStringLiteral(color);
455                 lexer.addStringLiteral(";\n");
456             }
457 
458             lexer.addStringLiteral(" }\n");
459         }
460 
461         attr = body.getAttrByName("link");
462 
463         if (attr != null)
464         {
465             addColorRule(lexer, " :link", attr.value);
466             body.removeAttribute(attr);
467         }
468 
469         attr = body.getAttrByName("vlink");
470 
471         if (attr != null)
472         {
473             addColorRule(lexer, " :visited", attr.value);
474             body.removeAttribute(attr);
475         }
476 
477         attr = body.getAttrByName("alink");
478 
479         if (attr != null)
480         {
481             addColorRule(lexer, " :active", attr.value);
482             body.removeAttribute(attr);
483         }
484     }
485 
486     /***
487      * Check deprecated attributes in body tag.
488      * @param lexer Lexer
489      * @param doc document root node
490      * @return <code>true</code> is the body doesn't contain deprecated attributes, false otherwise.
491      */
492     private boolean niceBody(Lexer lexer, Node doc)
493     {
494         Node body = doc.findBody(lexer.configuration.tt);
495 
496         if (body != null)
497         {
498             if (body.getAttrByName("background") != null
499                 || body.getAttrByName("bgcolor") != null
500                 || body.getAttrByName("text") != null
501                 || body.getAttrByName("link") != null
502                 || body.getAttrByName("vlink") != null
503                 || body.getAttrByName("alink") != null)
504             {
505                 lexer.badLayout |= Report.USING_BODY;
506                 return false;
507             }
508         }
509 
510         return true;
511     }
512 
513     /***
514      * Create style element using rules from dictionary.
515      * @param lexer Lexer
516      * @param doc root node
517      */
518     private void createStyleElement(Lexer lexer, Node doc)
519     {
520         Node node, head, body;
521         Style style;
522         AttVal av;
523 
524         if (lexer.styles == null && niceBody(lexer, doc))
525         {
526             return;
527         }
528 
529         node = lexer.newNode(Node.START_TAG, null, 0, 0, "style");
530         node.implicit = true;
531 
532         // insert type attribute
533         av = new AttVal(null, null, '"', "type", "text/css");
534         av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
535         node.attributes = av;
536 
537         body = doc.findBody(lexer.configuration.tt);
538 
539         lexer.txtstart = lexer.lexsize;
540 
541         if (body != null)
542         {
543             cleanBodyAttrs(lexer, body);
544         }
545 
546         for (style = lexer.styles; style != null; style = style.next)
547         {
548             lexer.addCharToLexer(' ');
549             lexer.addStringLiteral(style.tag);
550             lexer.addCharToLexer('.');
551             lexer.addStringLiteral(style.tagClass);
552             lexer.addCharToLexer(' ');
553             lexer.addCharToLexer('{');
554             lexer.addStringLiteral(style.properties);
555             lexer.addCharToLexer('}');
556             lexer.addCharToLexer('\n');
557         }
558 
559         lexer.txtend = lexer.lexsize;
560 
561         node.insertNodeAtEnd(lexer.newNode(Node.TEXT_NODE, lexer.lexbuf, lexer.txtstart, lexer.txtend));
562 
563         // now insert style element into document head doc is root node. search its children for html node the head
564         // node should be first child of html node
565 
566         head = doc.findHEAD(lexer.configuration.tt);
567 
568         if (head != null)
569         {
570             head.insertNodeAtEnd(node);
571         }
572     }
573 
574     /***
575      * Ensure bidirectional links are consistent.
576      * @param node root node
577      */
578     private void fixNodeLinks(Node node)
579     {
580         Node child;
581 
582         if (node.prev != null)
583         {
584             node.prev.next = node;
585         }
586         else
587         {
588             node.parent.content = node;
589         }
590 
591         if (node.next != null)
592         {
593             node.next.prev = node;
594         }
595         else
596         {
597             node.parent.last = node;
598         }
599 
600         for (child = node.content; child != null; child = child.next)
601         {
602             child.parent = node;
603         }
604     }
605 
606     /***
607      * Used to strip child of node when the node has one and only one child.
608      * @param node parent node
609      */
610     private void stripOnlyChild(Node node)
611     {
612         Node child;
613 
614         child = node.content;
615         node.content = child.content;
616         node.last = child.last;
617         child.content = null;
618 
619         for (child = node.content; child != null; child = child.next)
620         {
621             child.parent = node;
622         }
623     }
624 
625     /***
626      * Used to strip font start and end tags.
627      * @param element original node
628      * @param pnode passed in as array to allow modification. pnode[0] will contain the final node
629      * @todo remove the pnode parameter and make it a return value
630      */
631     private void discardContainer(Node element, Node[] pnode)
632     {
633         Node node;
634         Node parent = element.parent;
635 
636         if (element.content != null)
637         {
638             element.last.next = element.next;
639 
640             if (element.next != null)
641             {
642                 element.next.prev = element.last;
643                 element.last.next = element.next;
644             }
645             else
646             {
647                 parent.last = element.last;
648             }
649 
650             if (element.prev != null)
651             {
652                 element.content.prev = element.prev;
653                 element.prev.next = element.content;
654             }
655             else
656             {
657                 parent.content = element.content;
658             }
659 
660             for (node = element.content; node != null; node = node.next)
661             {
662                 node.parent = parent;
663             }
664 
665             pnode[0] = element.content;
666         }
667         else
668         {
669             if (element.next != null)
670             {
671                 element.next.prev = element.prev;
672             }
673             else
674             {
675                 parent.last = element.prev;
676             }
677 
678             if (element.prev != null)
679             {
680                 element.prev.next = element.next;
681             }
682             else
683             {
684                 parent.content = element.next;
685             }
686 
687             pnode[0] = element.next;
688         }
689 
690         element.next = null;
691         element.content = null;
692     }
693 
694     /***
695      * Add style property to element, creating style attribute as needed and adding ; delimiter.
696      * @param node node
697      * @param property property added to node
698      */
699     private void addStyleProperty(Node node, String property)
700     {
701         AttVal av;
702 
703         for (av = node.attributes; av != null; av = av.next)
704         {
705             if (av.attribute.equals("style"))
706             {
707                 break;
708             }
709         }
710 
711         // if style attribute already exists then insert property
712 
713         if (av != null)
714         {
715             String s;
716 
717             s = addProperty(av.value, property);
718             av.value = s;
719         }
720         else
721         {
722             // else create new style attribute
723             av = new AttVal(node.attributes, null, '"', "style", property);
724             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
725             node.attributes = av;
726         }
727     }
728 
729     /***
730      * Create new string that consists of the combined style properties in s1 and s2. To merge property lists, we build
731      * a linked list of property/values and insert properties into the list in order, merging values for the same
732      * property name.
733      * @param s1 first property
734      * @param s2 second property
735      * @return merged properties
736      */
737     private String mergeProperties(String s1, String s2)
738     {
739         String s;
740         StyleProp prop;
741 
742         prop = createProps(null, s1);
743         prop = createProps(prop, s2);
744         s = createPropString(prop);
745         return s;
746     }
747 
748     /***
749      * Merge class attributes from 2 nodes.
750      * @param node Node
751      * @param child Child node
752      */
753     private void mergeClasses(Node node, Node child)
754     {
755         AttVal av;
756         String s1, s2, names;
757 
758         for (s2 = null, av = child.attributes; av != null; av = av.next)
759         {
760             if ("class".equals(av.attribute))
761             {
762                 s2 = av.value;
763                 break;
764             }
765         }
766 
767         for (s1 = null, av = node.attributes; av != null; av = av.next)
768         {
769             if ("class".equals(av.attribute))
770             {
771                 s1 = av.value;
772                 break;
773             }
774         }
775 
776         if (s1 != null)
777         {
778             if (s2 != null) // merge class names from both
779             {
780                 names = s1 + ' ' + s2;
781                 av.value = names;
782             }
783         }
784         else if (s2 != null) // copy class names from child
785         {
786             av = new AttVal(node.attributes, null, '"', "class", s2);
787             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
788             node.attributes = av;
789         }
790     }
791 
792     /***
793      * Merge style from 2 nodes.
794      * @param node Node
795      * @param child Child node
796      */
797     private void mergeStyles(Node node, Node child)
798     {
799         AttVal av;
800         String s1, s2, style;
801 
802         // the child may have a class attribute used for attaching styles, if so the class name needs to be copied to
803         // node's class
804         mergeClasses(node, child);
805 
806         for (s2 = null, av = child.attributes; av != null; av = av.next)
807         {
808             if (av.attribute.equals("style"))
809             {
810                 s2 = av.value;
811                 break;
812             }
813         }
814 
815         for (s1 = null, av = node.attributes; av != null; av = av.next)
816         {
817             if (av.attribute.equals("style"))
818             {
819                 s1 = av.value;
820                 break;
821             }
822         }
823 
824         if (s1 != null)
825         {
826             if (s2 != null) // merge styles from both
827             {
828                 style = mergeProperties(s1, s2);
829                 av.value = style;
830             }
831         }
832         else if (s2 != null) // copy style of child
833         {
834             av = new AttVal(node.attributes, null, '"', "style", s2);
835             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
836             node.attributes = av;
837         }
838     }
839 
840     /***
841      * Map a % font size to a named font size.
842      * @param size size in %
843      * @return font size name
844      */
845     private String fontSize2Name(String size)
846     {
847         String[] sizes = {"60%", "70%", "80%", null, "120%", "150%", "200%"};
848         String buf;
849 
850         if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6')
851         {
852             int n = size.charAt(0) - '0';
853             return sizes[n];
854         }
855 
856         if (size.length() > 0 && size.charAt(0) == '-')
857         {
858             if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6')
859             {
860                 int n = size.charAt(1) - '0';
861                 double x;
862 
863                 for (x = 1.0; n > 0; --n)
864                 {
865                     x *= 0.8;
866                 }
867 
868                 x *= 100.0;
869                 buf = "" + (int) x + "%";
870 
871                 return buf;
872             }
873 
874             return "smaller"; /* "70%"; */
875         }
876 
877         if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6')
878         {
879             int n = size.charAt(1) - '0';
880             double x;
881 
882             for (x = 1.0; n > 0; --n)
883             {
884                 x *= 1.2;
885             }
886 
887             x *= 100.0;
888             buf = "" + (int) x + "%";
889 
890             return buf;
891         }
892 
893         return "larger"; /* "140%" */
894     }
895 
896     /***
897      * Adds a font-family style.
898      * @param node Node
899      * @param face font face
900      */
901     private void addFontFace(Node node, String face)
902     {
903         addStyleProperty(node, "font-family: " + face);
904     }
905 
906     /***
907      * Adds a font size style.
908      * @param node Node
909      * @param size font size
910      */
911     private void addFontSize(Node node, String size)
912     {
913         String value;
914 
915         if (size.equals("6") && node.tag == this.tt.tagP)
916         {
917             node.element = "h1";
918             this.tt.findTag(node);
919             return;
920         }
921 
922         if (size.equals("5") && node.tag == this.tt.tagP)
923         {
924             node.element = "h2";
925             this.tt.findTag(node);
926             return;
927         }
928 
929         if (size.equals("4") && node.tag == this.tt.tagP)
930         {
931             node.element = "h3";
932             this.tt.findTag(node);
933             return;
934         }
935 
936         value = fontSize2Name(size);
937 
938         if (value != null)
939         {
940             addStyleProperty(node, "font-size: " + value);
941         }
942     }
943 
944     /***
945      * Adds a font color style.
946      * @param node Node
947      * @param color color value
948      */
949     private void addFontColor(Node node, String color)
950     {
951         addStyleProperty(node, "color: " + color);
952     }
953 
954     /***
955      * Adds an align style.
956      * @param node Node
957      * @param align align value
958      */
959     private void addAlign(Node node, String align)
960     {
961         // force alignment value to lower case
962         addStyleProperty(node, "text-align: " + align.toLowerCase());
963     }
964 
965     /***
966      * Add style properties to node corresponding to the font face, size and color attributes.
967      * @param node font tag
968      * @param av attribute list for node
969      */
970     private void addFontStyles(Node node, AttVal av)
971     {
972         while (av != null)
973         {
974             if (av.attribute.equals("face"))
975             {
976                 addFontFace(node, av.value);
977             }
978             else if (av.attribute.equals("size"))
979             {
980                 addFontSize(node, av.value);
981             }
982             else if (av.attribute.equals("color"))
983             {
984                 addFontColor(node, av.value);
985             }
986 
987             av = av.next;
988         }
989     }
990 
991     /***
992      * Symptom: <code>&lt;p align=center></code>. Action: <code>&lt;p style="text-align: center"></code>.
993      * @param lexer Lexer
994      * @param node node with center attribute. Will be modified to use css style.
995      */
996     private void textAlign(Lexer lexer, Node node)
997     {
998         AttVal av, prev;
999 
1000         prev = null;
1001 
1002         for (av = node.attributes; av != null; av = av.next)
1003         {
1004             if (av.attribute.equals("align"))
1005             {
1006                 if (prev != null)
1007                 {
1008                     prev.next = av.next;
1009                 }
1010                 else
1011                 {
1012                     node.attributes = av.next;
1013                 }
1014 
1015                 if (av.value != null)
1016                 {
1017                     addAlign(node, av.value);
1018                 }
1019 
1020                 break;
1021             }
1022 
1023             prev = av;
1024         }
1025     }
1026 
1027     /***
1028      * Symptom: <code>&lt;dir>&lt;li></code> where <code>&lt;li></code> is only child. Action: coerce
1029      * <code>&lt;dir> &lt;li></code> to <code>&lt;div></code> with indent. The clean up rules use the pnode argument
1030      * to return the next node when the original node has been deleted.
1031      * @param lexer Lexer
1032      * @param node dir tag
1033      * @return <code>true</code> if a dir tag has been coerced to a div
1034      */
1035     private boolean dir2Div(Lexer lexer, Node node)
1036     {
1037         Node child;
1038 
1039         if (node.tag == this.tt.tagDir || node.tag == this.tt.tagUl || node.tag == this.tt.tagOl)
1040         {
1041             child = node.content;
1042 
1043             if (child == null)
1044             {
1045                 return false;
1046             }
1047 
1048             // check child has no peers
1049             if (child.next != null)
1050             {
1051                 return false;
1052             }
1053 
1054             if (child.tag != this.tt.tagLi)
1055             {
1056                 return false;
1057             }
1058 
1059             if (!child.implicit)
1060             {
1061                 return false;
1062             }
1063 
1064             // coerce dir to div
1065             node.tag = this.tt.tagDiv;
1066             node.element = "div";
1067             addStyleProperty(node, "margin-left: 2em");
1068             stripOnlyChild(node);
1069             return true;
1070         }
1071 
1072         return false;
1073     }
1074 
1075     /***
1076      * Symptom:
1077      * 
1078      * <pre>
1079      * &lt;center>
1080      * </pre>.
1081      * <p>
1082      * Action: replace <code>&lt;center></code> by <code>&lt;div style="text-align: center"></code>
1083      * </p>
1084      * @param lexer Lexer
1085      * @param node center tag
1086      * @param pnode pnode[0] is the same as node, passed in as an array to allow modification
1087      * @return <code>true</code> if a center tag has been replaced by a div
1088      */
1089     private boolean center2Div(Lexer lexer, Node node, Node[] pnode)
1090     {
1091         if (node.tag == this.tt.tagCenter)
1092         {
1093             if (lexer.configuration.dropFontTags)
1094             {
1095                 if (node.content != null)
1096                 {
1097                     Node last = node.last;
1098                     Node parent = node.parent;
1099 
1100                     discardContainer(node, pnode);
1101 
1102                     node = lexer.inferredTag("br");
1103 
1104                     if (last.next != null)
1105                     {
1106                         last.next.prev = node;
1107                     }
1108 
1109                     node.next = last.next;
1110                     last.next = node;
1111                     node.prev = last;
1112 
1113                     if (parent.last == last)
1114                     {
1115                         parent.last = node;
1116                     }
1117 
1118                     node.parent = parent;
1119                 }
1120                 else
1121                 {
1122                     Node prev = node.prev;
1123                     Node next = node.next;
1124                     Node parent = node.parent;
1125                     discardContainer(node, pnode);
1126 
1127                     node = lexer.inferredTag("br");
1128                     node.next = next;
1129                     node.prev = prev;
1130                     node.parent = parent;
1131 
1132                     if (next != null)
1133                     {
1134                         next.prev = node;
1135                     }
1136                     else
1137                     {
1138                         parent.last = node;
1139                     }
1140 
1141                     if (prev != null)
1142                     {
1143                         prev.next = node;
1144                     }
1145                     else
1146                     {
1147                         parent.content = node;
1148                     }
1149                 }
1150 
1151                 return true;
1152             }
1153             node.tag = this.tt.tagDiv;
1154             node.element = "div";
1155             addStyleProperty(node, "text-align: center");
1156             return true;
1157         }
1158 
1159         return false;
1160     }
1161 
1162     /***
1163      * Symptom: <code>&lt;div>&lt;div>...&lt;/div>&lt;/div></code> Action: merge the two divs. This is useful after
1164      * nested &lt;dir>s used by Word for indenting have been converted to &lt;div>s.
1165      * @param lexer Lexer
1166      * @param node first div
1167      * @return true if the divs have been merged
1168      */
1169     private boolean mergeDivs(Lexer lexer, Node node)
1170     {
1171         Node child;
1172 
1173         if (node.tag != this.tt.tagDiv)
1174         {
1175             return false;
1176         }
1177 
1178         child = node.content;
1179 
1180         if (child == null)
1181         {
1182             return false;
1183         }
1184 
1185         if (child.tag != this.tt.tagDiv)
1186         {
1187             return false;
1188         }
1189 
1190         if (child.next != null)
1191         {
1192             return false;
1193         }
1194 
1195         mergeStyles(node, child);
1196         stripOnlyChild(node);
1197         return true;
1198     }
1199 
1200     /***
1201      * Symptom:
1202      * <ul>
1203      * <li>
1204      * <ul>
1205      * ...
1206      * </ul>
1207      * </li>
1208      * </ul>
1209      * Action: discard outer list.
1210      * @param lexer Lexer
1211      * @param node Node
1212      * @param pnode passed in as array to allow modifications.
1213      * @return <code>true</code> if nested lists have been found and replaced
1214      */
1215     private boolean nestedList(Lexer lexer, Node node, Node[] pnode)
1216     {
1217         Node child, list;
1218 
1219         if (node.tag == this.tt.tagUl || node.tag == this.tt.tagOl)
1220         {
1221             child = node.content;
1222 
1223             if (child == null)
1224             {
1225                 return false;
1226             }
1227 
1228             // check child has no peers
1229 
1230             if (child.next != null)
1231             {
1232                 return false;
1233             }
1234 
1235             list = child.content;
1236 
1237             if (list == null)
1238             {
1239                 return false;
1240             }
1241 
1242             if (list.tag != node.tag)
1243             {
1244                 return false;
1245             }
1246 
1247             pnode[0] = list; // Set node to resume iteration
1248 
1249             // move inner list node into position of outer node
1250             list.prev = node.prev;
1251             list.next = node.next;
1252             list.parent = node.parent;
1253             fixNodeLinks(list);
1254 
1255             // get rid of outer ul and its li
1256             // XXX: Are we leaking the child node? -creitzel 7 Jun, 01
1257             child.content = null;
1258             node.content = null;
1259             node.next = null;
1260             node = null;
1261 
1262             // If prev node was a list the chances are this node should be appended to that list. Word has no way of
1263             // recognizing nested lists and just uses indents
1264             if (list.prev != null)
1265             {
1266                 if (list.prev.tag == this.tt.tagUl || list.prev.tag == this.tt.tagOl)
1267                 {
1268 
1269                     node = list;
1270                     list = node.prev;
1271 
1272                     list.next = node.next;
1273 
1274                     if (list.next != null)
1275                     {
1276                         list.next.prev = list;
1277                     }
1278 
1279                     child = list.last; /* <li> */
1280 
1281                     node.parent = child;
1282                     node.next = null;
1283                     node.prev = child.last;
1284                     fixNodeLinks(node);
1285                     cleanNode(lexer, node);
1286                 }
1287             }
1288 
1289             return true;
1290         }
1291 
1292         return false;
1293     }
1294 
1295     /***
1296      * Symptom: the only child of a block-level element is a presentation element such as B, I or FONT. Action: add
1297      * style "font-weight: bold" to the block and strip the &lt;b>element, leaving its children. example:
1298      * 
1299      * <pre>
1300      * &lt;p>
1301      * &lt;b>&lt;font face="Arial" size="6">Draft Recommended Practice&lt;/font>&lt;/b>
1302      * &lt;/p>
1303      * </pre>
1304      * 
1305      * becomes:
1306      * 
1307      * <pre>
1308      * &lt;p style="font-weight: bold; font-family: Arial; font-size: 6">
1309      * Draft Recommended Practice
1310      * &lt;/p>
1311      * </pre>
1312      * 
1313      * <p>
1314      * This code also replaces the align attribute by a style attribute. However, to avoid CSS problems with Navigator
1315      * 4, this isn't done for the elements: caption, tr and table
1316      * </p>
1317      * @param lexer Lexer
1318      * @param node parent node
1319      * @return <code>true</code> if the child node has been removed
1320      */
1321     private boolean blockStyle(Lexer lexer, Node node)
1322     {
1323         Node child;
1324 
1325         if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0)
1326         {
1327             if (node.tag != this.tt.tagTable && node.tag != this.tt.tagTr && node.tag != this.tt.tagLi)
1328             {
1329                 // check for align attribute
1330                 if (node.tag != this.tt.tagCaption)
1331                 {
1332                     textAlign(lexer, node);
1333                 }
1334 
1335                 child = node.content;
1336 
1337                 if (child == null)
1338                 {
1339                     return false;
1340                 }
1341 
1342                 // check child has no peers
1343                 if (child.next != null)
1344                 {
1345                     return false;
1346                 }
1347 
1348                 if (child.tag == this.tt.tagB)
1349                 {
1350                     mergeStyles(node, child);
1351                     addStyleProperty(node, "font-weight: bold");
1352                     stripOnlyChild(node);
1353                     return true;
1354                 }
1355 
1356                 if (child.tag == this.tt.tagI)
1357                 {
1358                     mergeStyles(node, child);
1359                     addStyleProperty(node, "font-style: italic");
1360                     stripOnlyChild(node);
1361                     return true;
1362                 }
1363 
1364                 if (child.tag == this.tt.tagFont)
1365                 {
1366                     mergeStyles(node, child);
1367                     addFontStyles(node, child.attributes);
1368                     stripOnlyChild(node);
1369                     return true;
1370                 }
1371             }
1372         }
1373 
1374         return false;
1375     }
1376 
1377     /***
1378      * If the node has only one b, i, or font child remove the child node and add the appropriate style attributes to
1379      * parent.
1380      * @param lexer Lexer
1381      * @param node parent node
1382      * @param pnode passed as an array to allow modifications
1383      * @return <code>true</code> if child node has been stripped, replaced by style attributes.
1384      */
1385     private boolean inlineStyle(Lexer lexer, Node node, Node[] pnode)
1386     {
1387         Node child;
1388 
1389         if (node.tag != this.tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0)
1390         {
1391             child = node.content;
1392 
1393             if (child == null)
1394             {
1395                 return false;
1396             }
1397 
1398             // check child has no peers
1399             if (child.next != null)
1400             {
1401                 return false;
1402             }
1403 
1404             if (child.tag == this.tt.tagB && lexer.configuration.logicalEmphasis)
1405             {
1406                 mergeStyles(node, child);
1407                 addStyleProperty(node, "font-weight: bold");
1408                 stripOnlyChild(node);
1409                 return true;
1410             }
1411 
1412             if (child.tag == this.tt.tagI && lexer.configuration.logicalEmphasis)
1413             {
1414                 mergeStyles(node, child);
1415                 addStyleProperty(node, "font-style: italic");
1416                 stripOnlyChild(node);
1417                 return true;
1418             }
1419 
1420             if (child.tag == this.tt.tagFont)
1421             {
1422                 mergeStyles(node, child);
1423                 addFontStyles(node, child.attributes);
1424                 stripOnlyChild(node);
1425                 return true;
1426             }
1427         }
1428 
1429         return false;
1430     }
1431 
1432     /***
1433      * Replace font elements by span elements, deleting the font element's attributes and replacing them by a single
1434      * style attribute.
1435      * @param lexer Lexer
1436      * @param node font tag
1437      * @param pnode passed as an array to allow modifications
1438      * @return <code>true</code> if a font tag has been dropped and replaced by style attributes
1439      */
1440     private boolean font2Span(Lexer lexer, Node node, Node[] pnode)
1441     {
1442         AttVal av, style, next;
1443 
1444         if (node.tag == this.tt.tagFont)
1445         {
1446             if (lexer.configuration.dropFontTags)
1447             {
1448                 discardContainer(node, pnode);
1449                 return false;
1450             }
1451 
1452             // if FONT is only child of parent element then leave alone
1453             if (node.parent.content == node && node.next == null)
1454             {
1455                 return false;
1456             }
1457 
1458             addFontStyles(node, node.attributes);
1459 
1460             // extract style attribute and free the rest
1461             av = node.attributes;
1462             style = null;
1463 
1464             while (av != null)
1465             {
1466                 next = av.next;
1467 
1468                 if (av.attribute.equals("style"))
1469                 {
1470                     av.next = null;
1471                     style = av;
1472                 }
1473 
1474                 av = next;
1475             }
1476 
1477             node.attributes = style;
1478 
1479             node.tag = this.tt.tagSpan;
1480             node.element = "span";
1481 
1482             return true;
1483         }
1484 
1485         return false;
1486     }
1487 
1488     /***
1489      * Applies all matching rules to a node.
1490      * @param lexer Lexer
1491      * @param node original node
1492      * @return cleaned up node
1493      */
1494     private Node cleanNode(Lexer lexer, Node node)
1495     {
1496         Node next = null;
1497         Node[] o = new Node[1];
1498         boolean b = false;
1499 
1500         for (next = node; node != null && node.isElement(); node = next)
1501         {
1502             o[0] = next;
1503 
1504             b = dir2Div(lexer, node);
1505             next = o[0];
1506             if (b)
1507             {
1508                 continue;
1509             }
1510 
1511             // Special case: true result means that arg node and its parent no longer exist.
1512             // So we must jump back up the CreateStyleProperties() call stack until we have a valid node reference.
1513             b = nestedList(lexer, node, o);
1514             next = o[0];
1515             if (b)
1516             {
1517                 return next;
1518             }
1519 
1520             b = center2Div(lexer, node, o);
1521             next = o[0];
1522             if (b)
1523             {
1524                 continue;
1525             }
1526 
1527             b = mergeDivs(lexer, node);
1528             next = o[0];
1529             if (b)
1530             {
1531                 continue;
1532             }
1533 
1534             b = blockStyle(lexer, node);
1535             next = o[0];
1536             if (b)
1537             {
1538                 continue;
1539             }
1540 
1541             b = inlineStyle(lexer, node, o);
1542             next = o[0];
1543             if (b)
1544             {
1545                 continue;
1546             }
1547 
1548             b = font2Span(lexer, node, o);
1549             next = o[0];
1550             if (b)
1551             {
1552                 continue;
1553             }
1554 
1555             break;
1556         }
1557 
1558         return next;
1559     }
1560 
1561     /***
1562      * Special case: if the current node is destroyed by CleanNode() lower in the tree, this node and its parent no
1563      * longer exist. So we must jump back up the CreateStyleProperties() call stack until we have a valid node
1564      * reference.
1565      * @param lexer Lexer
1566      * @param node Node
1567      * @param prepl passed in as array to allow modifications
1568      * @return cleaned Node
1569      */
1570     private Node createStyleProperties(Lexer lexer, Node node, Node[] prepl)
1571     {
1572         Node child;
1573 
1574         if (node.content != null)
1575         {
1576             Node[] repl = new Node[1];
1577             repl[0] = node;
1578             for (child = node.content; child != null; child = child.next)
1579             {
1580                 child = createStyleProperties(lexer, child, repl);
1581                 if (repl[0] != node)
1582                 {
1583                     return repl[0];
1584                 }
1585             }
1586         }
1587 
1588         return cleanNode(lexer, node);
1589     }
1590 
1591     /***
1592      * Find style attribute in node content, and replace it by corresponding class attribute.
1593      * @param lexer Lexer
1594      * @param node parent node
1595      */
1596     private void defineStyleRules(Lexer lexer, Node node)
1597     {
1598         Node child;
1599 
1600         if (node.content != null)
1601         {
1602             child = node.content;
1603             while (child != null)
1604             {
1605                 defineStyleRules(lexer, child);
1606                 child = child.next;
1607             }
1608         }
1609 
1610         style2Rule(lexer, node);
1611     }
1612 
1613     /***
1614      * Clean an html tree.
1615      * @param lexer Lexer
1616      * @param doc root node
1617      */
1618     public void cleanTree(Lexer lexer, Node doc)
1619     {
1620         Node[] repl = new Node[1];
1621         repl[0] = doc;
1622         doc = createStyleProperties(lexer, doc, repl);
1623 
1624         if (!lexer.configuration.makeClean)
1625         {
1626             defineStyleRules(lexer, doc);
1627             createStyleElement(lexer, doc);
1628         }
1629     }
1630 
1631     /***
1632      * simplifies <b><b>... </b> ... </b> etc.
1633      * @param node root Node
1634      */
1635     public void nestedEmphasis(Node node)
1636     {
1637         Node[] o = new Node[1];
1638         Node next;
1639 
1640         while (node != null)
1641         {
1642             next = node.next;
1643 
1644             if ((node.tag == this.tt.tagB || node.tag == this.tt.tagI)
1645                 && node.parent != null
1646                 && node.parent.tag == node.tag)
1647             {
1648                 // strip redundant inner element
1649                 o[0] = next;
1650                 discardContainer(node, o);
1651                 next = o[0];
1652                 node = next;
1653                 continue;
1654             }
1655 
1656             if (node.content != null)
1657             {
1658                 nestedEmphasis(node.content);
1659             }
1660 
1661             node = next;
1662         }
1663     }
1664 
1665     /***
1666      * Replace i by em and b by strong.
1667      * @param node root Node
1668      */
1669     public void emFromI(Node node)
1670     {
1671         while (node != null)
1672         {
1673             if (node.tag == this.tt.tagI)
1674             {
1675                 node.element = this.tt.tagEm.name;
1676                 node.tag = this.tt.tagEm;
1677             }
1678             else if (node.tag == this.tt.tagB)
1679             {
1680                 node.element = this.tt.tagStrong.name;
1681                 node.tag = this.tt.tagStrong;
1682             }
1683 
1684             if (node.content != null)
1685             {
1686                 emFromI(node.content);
1687             }
1688 
1689             node = node.next;
1690         }
1691     }
1692 
1693     /***
1694      * Some people use dir or ul without an li to indent the content. The pattern to look for is a list with a single
1695      * implicit li. This is recursively replaced by an implicit blockquote.
1696      * @param node root Node
1697      */
1698     public void list2BQ(Node node)
1699     {
1700         while (node != null)
1701         {
1702             if (node.content != null)
1703             {
1704                 list2BQ(node.content);
1705             }
1706 
1707             if (node.tag != null
1708                 && node.tag.getParser() == ParserImpl.LIST
1709                 && node.hasOneChild()
1710                 && node.content.implicit)
1711             {
1712                 stripOnlyChild(node);
1713                 node.element = this.tt.tagBlockquote.name;
1714                 node.tag = this.tt.tagBlockquote;
1715                 node.implicit = true;
1716             }
1717 
1718             node = node.next;
1719         }
1720     }
1721 
1722     /***
1723      * Replace implicit blockquote by div with an indent taking care to reduce nested blockquotes to a single div with
1724      * the indent set to match the nesting depth.
1725      * @param node root Node
1726      */
1727     public void bQ2Div(Node node)
1728     {
1729         int indent;
1730         String indentBuf;
1731         AttVal attval;
1732 
1733         while (node != null)
1734         {
1735             if (node.tag == this.tt.tagBlockquote && node.implicit)
1736             {
1737                 indent = 1;
1738 
1739                 while (node.hasOneChild() && node.content.tag == this.tt.tagBlockquote && node.implicit)
1740                 {
1741                     ++indent;
1742                     stripOnlyChild(node);
1743                 }
1744 
1745                 if (node.content != null)
1746                 {
1747                     bQ2Div(node.content);
1748                 }
1749 
1750                 indentBuf = "margin-left: " + (new Integer(2 * indent)).toString() + "em";
1751 
1752                 node.element = this.tt.tagDiv.name;
1753                 node.tag = this.tt.tagDiv;
1754 
1755                 attval = node.getAttrByName("style");
1756 
1757                 if (attval != null && attval.value != null)
1758                 {
1759                     attval.value = indentBuf + "; " + attval.value;
1760                 }
1761                 else
1762                 {
1763                     node.addAttribute("style", indentBuf);
1764                 }
1765             }
1766             else if (node.content != null)
1767             {
1768                 bQ2Div(node.content);
1769             }
1770 
1771             node = node.next;
1772         }
1773     }
1774 
1775     /***
1776      * Find the enclosing table cell for the given node.
1777      * @param node Node
1778      * @return enclosing cell node
1779      */
1780     Node findEnclosingCell(Node node)
1781     {
1782         Node check;
1783 
1784         for (check = node; check != null; check = check.parent)
1785         {
1786             if (check.tag == tt.tagTd)
1787             {
1788                 return check;
1789             }
1790         }
1791         return null;
1792     }
1793 
1794     /***
1795      * node is <code>&lt;![if ...]&gt;</code> prune up to <code>&lt;![endif]&gt;</code>.
1796      * @param lexer Lexer
1797      * @param node Node
1798      * @return cleaned up Node
1799      */
1800     public Node pruneSection(Lexer lexer, Node node)
1801     {
1802         for (;;)
1803         {
1804 
1805             // FG: commented out - don't add &nbsp; to empty cells
1806 
1807             // if ((Lexer.getString(node.textarray, node.start, 21)).equals("if !supportEmptyParas"))
1808             // {
1809             //     Node cell = findEnclosingCell(node);
1810             //     if (cell != null)
1811             //     {
1812             //         // Need to put &nbsp; into cell so it doesn't look weird
1813             //         char onesixty[] = {(char) 160, (char) 0};
1814             //         Node nbsp = lexer.newLiteralTextNode(lexer, onesixty);
1815             //         Node.insertNodeBeforeElement(node, nbsp);
1816             //      }
1817             //  }
1818 
1819             // discard node and returns next
1820             node = Node.discardElement(node);
1821 
1822             if (node == null)
1823             {
1824                 return null;
1825             }
1826 
1827             if (node.type == Node.SECTION_TAG)
1828             {
1829                 if ((TidyUtils.getString(node.textarray, node.start, 2)).equals("if"))
1830                 {
1831                     node = pruneSection(lexer, node);
1832                     continue;
1833                 }
1834 
1835                 if ((TidyUtils.getString(node.textarray, node.start, 5)).equals("endif"))
1836                 {
1837                     node = Node.discardElement(node);
1838                     break;
1839                 }
1840             }
1841         }
1842 
1843         return node;
1844     }
1845 
1846     /***
1847      * Drop if/endif sections inserted by word2000.
1848      * @param lexer Lexer
1849      * @param node Node root node
1850      */
1851     public void dropSections(Lexer lexer, Node node)
1852     {
1853         while (node != null)
1854         {
1855             if (node.type == Node.SECTION_TAG)
1856             {
1857                 // prune up to matching endif
1858                 if ((TidyUtils.getString(node.textarray, node.start, 2)).equals("if")
1859                     && (!(TidyUtils.getString(node.textarray, node.start, 7)).equals("if !vml"))) // #444394 - fix 13
1860                 // Sep 01
1861                 {
1862                     node = pruneSection(lexer, node);
1863                     continue;
1864                 }
1865 
1866                 // discard others as well
1867                 node = Node.discardElement(node);
1868                 continue;
1869             }
1870 
1871             if (node.content != null)
1872             {
1873                 dropSections(lexer, node.content);
1874             }
1875 
1876             node = node.next;
1877         }
1878     }
1879 
1880     /***
1881      * Remove word2000 attributes from node.
1882      * @param node node to cleanup
1883      */
1884     public void purgeWord2000Attributes(Node node)
1885     {
1886         AttVal attr = null;
1887         AttVal next = null;
1888         AttVal prev = null;
1889 
1890         for (attr = node.attributes; attr != null; attr = next)
1891         {
1892             next = attr.next;
1893 
1894             // special check for class="Code" denoting pre text
1895             // Pass thru user defined styles as HTML class names
1896             if (attr.attribute != null && attr.value != null && attr.attribute.equals("class"))
1897             {
1898                 if (attr.value.equals("Code") || !attr.value.startsWith("Mso"))
1899                 {
1900                     prev = attr;
1901                     continue;
1902                 }
1903             }
1904 
1905             if (attr.attribute != null
1906                 && (attr.attribute.equals("class")
1907                     || attr.attribute.equals("style")
1908                     || attr.attribute.equals("lang")
1909                     || attr.attribute.startsWith("x:") || ((attr.attribute.equals("height") || attr.attribute
1910                     .equals("width")) && //
1911                 (node.tag == this.tt.tagTd || node.tag == this.tt.tagTr || node.tag == this.tt.tagTh))))
1912             {
1913                 if (prev != null)
1914                 {
1915                     prev.next = next;
1916                 }
1917                 else
1918                 {
1919                     node.attributes = next;
1920                 }
1921 
1922             }
1923             else
1924             {
1925                 prev = attr;
1926             }
1927         }
1928     }
1929 
1930     /***
1931      * Word2000 uses span excessively, so we strip span out.
1932      * @param lexer Lexer
1933      * @param span Node span
1934      * @return cleaned node
1935      */
1936     public Node stripSpan(Lexer lexer, Node span)
1937     {
1938         Node node;
1939         Node prev = null;
1940         Node content;
1941 
1942         // deal with span elements that have content by splicing the content in place of the span after having
1943         // processed it
1944 
1945         cleanWord2000(lexer, span.content);
1946         content = span.content;
1947 
1948         if (span.prev != null)
1949         {
1950             prev = span.prev;
1951         }
1952         else if (content != null)
1953         {
1954             node = content;
1955             content = content.next;
1956             node.removeNode();
1957             Node.insertNodeBeforeElement(span, node);
1958             prev = node;
1959         }
1960 
1961         while (content != null)
1962         {
1963             node = content;
1964             content = content.next;
1965             node.removeNode();
1966             prev.insertNodeAfterElement(node);
1967             prev = node;
1968         }
1969 
1970         if (span.next == null)
1971         {
1972             span.parent.last = prev;
1973         }
1974 
1975         node = span.next;
1976         span.content = null;
1977         Node.discardElement(span);
1978         return node;
1979     }
1980 
1981     /***
1982      * Map non-breaking spaces to regular spaces.
1983      * @param lexer Lexer
1984      * @param node Node
1985      */
1986     private void normalizeSpaces(Lexer lexer, Node node)
1987     {
1988         while (node != null)
1989         {
1990             if (node.content != null)
1991             {
1992                 normalizeSpaces(lexer, node.content);
1993             }
1994 
1995             if (node.type == Node.TEXT_NODE)
1996             {
1997                 int i;
1998                 int[] c = new int[1];
1999                 int p = node.start;
2000 
2001                 for (i = node.start; i < node.end; ++i)
2002                 {
2003                     c[0] = node.textarray[i];
2004 
2005                     // look for UTF-8 multibyte character
2006                     if (c[0] > 0x7F)
2007                     {
2008                         i += PPrint.getUTF8(node.textarray, i, c);
2009                     }
2010 
2011                     if (c[0] == 160)
2012                     {
2013                         c[0] = ' ';
2014                     }
2015 
2016                     p = PPrint.putUTF8(node.textarray, p, c[0]);
2017                 }
2018             }
2019 
2020             node = node.next;
2021         }
2022     }
2023 
2024     /***
2025      * Used to hunt for hidden preformatted sections.
2026      * @param node checked node
2027      * @return <code>true</code> if the node has a "margin-top: 0" or "margin-bottom: 0" style
2028      */
2029     boolean noMargins(Node node)
2030     {
2031         AttVal attval = node.getAttrByName("style");
2032 
2033         if (attval == null || attval.value == null)
2034         {
2035             return false;
2036         }
2037 
2038         // search for substring "margin-top: 0"
2039         if (attval.value.indexOf("margin-top: 0") == -1)
2040         {
2041             return false;
2042         }
2043 
2044         // search for substring "margin-top: 0"
2045         if (attval.value.indexOf("margin-bottom: 0") == -1)
2046         {
2047             return false;
2048         }
2049 
2050         return true;
2051     }
2052 
2053     /***
2054      * Does element have a single space as its content?
2055      * @param lexer Lexer
2056      * @param node checked node
2057      * @return <code>true</code> if the element has a single space as its content
2058      */
2059     boolean singleSpace(Lexer lexer, Node node)
2060     {
2061         if (node.content != null)
2062         {
2063             node = node.content;
2064 
2065             if (node.next != null)
2066             {
2067                 return false;
2068             }
2069 
2070             if (node.type != Node.TEXT_NODE)
2071             {
2072                 return false;
2073             }
2074 
2075             if (((node.end - node.start) == 1) && lexer.lexbuf[node.start] == ' ')
2076             {
2077                 return true;
2078             }
2079 
2080             if ((node.end - node.start) == 2)
2081             {
2082                 int[] c = new int[1];
2083 
2084                 PPrint.getUTF8(lexer.lexbuf, node.start, c);
2085 
2086                 if (c[0] == 160)
2087                 {
2088                     return true;
2089                 }
2090             }
2091         }
2092 
2093         return false;
2094     }
2095 
2096     /***
2097      * This is a major clean up to strip out all the extra stuff you get when you save as web page from Word 2000. It
2098      * doesn't yet know what to do with VML tags, but these will appear as errors unless you declare them as new tags,
2099      * such as o:p which needs to be declared as inline.
2100      * @param lexer Lexer
2101      * @param node node to clean up
2102      */
2103     public void cleanWord2000(Lexer lexer, Node node)
2104     {
2105         // used to a list from a sequence of bulletted p's
2106         Node list = null;
2107 
2108         while (node != null)
2109         {
2110 
2111             // get rid of Word's xmlns attributes
2112             if (node.tag == tt.tagHtml)
2113             {
2114                 // check that it's a Word 2000 document
2115                 if ((node.getAttrByName("xmlns:o") == null))
2116                 {
2117                     return;
2118                 }
2119                 lexer.configuration.tt.freeAttrs(node);
2120             }
2121 
2122             // fix up preformatted sections by looking for a sequence of paragraphs with zero top/bottom margin
2123             if (node.tag == tt.tagP)
2124             {
2125                 if (noMargins(node))
2126                 {
2127                     Node pre;
2128                     Node next;
2129                     Node.coerceNode(lexer, node, tt.tagPre);
2130 
2131                     purgeWord2000Attributes(node);
2132 
2133                     if (node.content != null)
2134                     {
2135                         cleanWord2000(lexer, node.content);
2136                     }
2137 
2138                     pre = node;
2139                     node = node.next;
2140 
2141                     // continue to strip p's
2142                     while (node.tag == tt.tagP && noMargins(node))
2143                     {
2144                         next = node.next;
2145                         node.removeNode();
2146                         pre.insertNodeAtEnd(lexer.newLineNode());
2147                         pre.insertNodeAtEnd(node);
2148                         stripSpan(lexer, node);
2149                         node = next;
2150                     }
2151 
2152                     if (node == null)
2153                     {
2154                         break;
2155                     }
2156                 }
2157             }
2158 
2159             if (node.tag != null && TidyUtils.toBoolean(node.tag.model & Dict.CM_BLOCK) && singleSpace(lexer, node))
2160             {
2161                 node = stripSpan(lexer, node);
2162                 continue;
2163             }
2164 
2165             // discard Word's style verbiage
2166             if (node.tag == this.tt.tagStyle || node.tag == this.tt.tagMeta || node.type == Node.COMMENT_TAG)
2167             {
2168                 node = Node.discardElement(node);
2169                 continue;
2170             }
2171 
2172             // strip out all span and font tags Word scatters so liberally!
2173             if (node.tag == this.tt.tagSpan || node.tag == this.tt.tagFont)
2174             {
2175                 node = stripSpan(lexer, node);
2176                 continue;
2177             }
2178 
2179             if (node.tag == this.tt.tagLink)
2180             {
2181                 AttVal attr = node.getAttrByName("rel");
2182 
2183                 if (attr != null && attr.value != null && attr.value.equals("File-List"))
2184                 {
2185                     node = Node.discardElement(node);
2186                     continue;
2187                 }
2188             }
2189 
2190             // discard empty paragraphs
2191             if (node.content == null && node.tag == this.tt.tagP)
2192             {
2193                 node = Node.discardElement(node);
2194                 continue;
2195             }
2196 
2197             if (node.tag == this.tt.tagP)
2198             {
2199                 AttVal attr = node.getAttrByName("class");
2200                 AttVal atrStyle = node.getAttrByName("style");
2201 
2202                 // (JES) Sometimes Word marks a list item with the following hokie syntax
2203                 // <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
2204                 // translate these into <li>
2205 
2206                 // map sequence of <p class="MsoListBullet"> to <ul> ... </ul>
2207                 // map <p class="MsoListNumber"> to <ol>...</ol>
2208                 if (attr != null
2209                     && attr.value != null
2210                     && ((attr.value.equals("MsoListBullet") || attr.value.equals("MsoListNumber")) //
2211                     || (atrStyle != null && (atrStyle.value.indexOf("mso-list:") != -1)))) // 463066 - fix by Joel
2212                 // Shafer 19 Sep 01
2213                 {
2214                     Dict listType = tt.tagUl;
2215 
2216                     if (attr.value.equals("MsoListNumber"))
2217                     {
2218                         listType = tt.tagOl;
2219                     }
2220 
2221                     Node.coerceNode(lexer, node, this.tt.tagLi);
2222 
2223                     if (list == null || list.tag != listType)
2224                     {
2225                         list = lexer.inferredTag(listType.name);
2226                         Node.insertNodeBeforeElement(node, list);
2227                     }
2228 
2229                     purgeWord2000Attributes(node);
2230 
2231                     if (node.content != null)
2232                     {
2233                         cleanWord2000(lexer, node.content);
2234                     }
2235 
2236                     // remove node and append to contents of list
2237                     node.removeNode();
2238                     list.insertNodeAtEnd(node);
2239                     node = list;
2240                 }
2241                 // map sequence of <p class="Code"> to <pre> ... </pre>
2242                 else if (attr != null && attr.value != null && attr.value.equals("Code"))
2243                 {
2244                     Node br = lexer.newLineNode();
2245                     normalizeSpaces(lexer, node);
2246 
2247                     if (list == null || list.tag != this.tt.tagPre)
2248                     {
2249                         list = lexer.inferredTag("pre");
2250                         Node.insertNodeBeforeElement(node, list);
2251                     }
2252 
2253                     // remove node and append to contents of list
2254                     node.removeNode();
2255                     list.insertNodeAtEnd(node);
2256                     stripSpan(lexer, node);
2257                     list.insertNodeAtEnd(br);
2258                     node = list.next;
2259                 }
2260                 else
2261                 {
2262                     list = null;
2263                 }
2264             }
2265             else
2266             {
2267                 list = null;
2268             }
2269 
2270             // strip out style and class attributes
2271             if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2272             {
2273                 purgeWord2000Attributes(node);
2274             }
2275 
2276             if (node.content != null)
2277             {
2278                 cleanWord2000(lexer, node.content);
2279             }
2280 
2281             node = node.next;
2282         }
2283     }
2284 
2285     /***
2286      * Check if the current document is a converted Word document.
2287      * @param root root Node
2288      * @return <code>true</code> if the document has been geenrated by Microsoft Word.
2289      */
2290     public boolean isWord2000(Node root)
2291     {
2292         AttVal attval;
2293         Node node;
2294         Node head;
2295         Node html = root.findHTML(this.tt);
2296 
2297         if (html != null && html.getAttrByName("xmlns:o") != null)
2298         {
2299             return true;
2300         }
2301 
2302         // search for <meta name="GENERATOR" content="Microsoft ...">
2303         head = root.findHEAD(tt);
2304 
2305         if (head != null)
2306         {
2307             for (node = head.content; node != null; node = node.next)
2308             {
2309                 if (node.tag != tt.tagMeta)
2310                 {
2311                     continue;
2312                 }
2313 
2314                 attval = node.getAttrByName("name");
2315 
2316                 if (attval == null || attval.value == null)
2317                 {
2318                     continue;
2319                 }
2320 
2321                 if (!"generator".equals(attval.value))
2322                 {
2323                     continue;
2324                 }
2325 
2326                 attval = node.getAttrByName("content");
2327 
2328                 if (attval == null || attval.value == null)
2329                 {
2330                     continue;
2331                 }
2332 
2333                 if (attval.value.indexOf("Microsoft") != -1)
2334                 {
2335                     return true;
2336                 }
2337             }
2338         }
2339 
2340         return false;
2341     }
2342 
2343     /***
2344      * Where appropriate move object elements from head to body.
2345      * @param lexer Lexer
2346      * @param html html node
2347      */
2348     static void bumpObject(Lexer lexer, Node html)
2349     {
2350         if (html == null)
2351         {
2352             return;
2353         }
2354 
2355         Node node, next, head = null, body = null;
2356         TagTable tt = lexer.configuration.tt;
2357         for (node = html.content; node != null; node = node.next)
2358         {
2359             if (node.tag == tt.tagHead)
2360             {
2361                 head = node;
2362             }
2363 
2364             if (node.tag == tt.tagBody)
2365             {
2366                 body = node;
2367             }
2368         }
2369 
2370         if (head != null && body != null)
2371         {
2372             for (node = head.content; node != null; node = next)
2373             {
2374                 next = node.next;
2375 
2376                 if (node.tag == tt.tagObject)
2377                 {
2378                     Node child;
2379                     boolean bump = false;
2380 
2381                     for (child = node.content; child != null; child = child.next)
2382                     {
2383                         // bump to body unless content is param
2384                         if ((child.type == Node.TEXT_NODE && !node.isBlank(lexer)) || child.tag != tt.tagParam)
2385                         {
2386                             bump = true;
2387                             break;
2388                         }
2389                     }
2390 
2391                     if (bump)
2392                     {
2393                         node.removeNode();
2394                         body.insertNodeAtStart(node);
2395                     }
2396                 }
2397             }
2398         }
2399     }
2400 
2401 }