1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54 package org.w3c.tidy;
55
/**
 * Clean up misuse of presentation markup. Filters from other formats such as Microsoft Word often make excessive use of
 * presentation markup such as font tags, B, I, and the align attribute. By applying a set of production rules, it is
 * straightforward to transform this to use CSS.
 * <p>
 * Some rules replace some of the children of an element by style properties on the element, e.g.
 * <code>&lt;p&gt;&lt;b&gt;...&lt;/b&gt;&lt;/p&gt;</code> becomes
 * <code>&lt;p style="font-weight: bold"&gt;...&lt;/p&gt;</code>. Such rules are applied to the element's content and
 * then to the element itself until none of the rules apply any more. Having applied all the rules to an element, it
 * will have a style attribute with one or more properties.
 * </p>
 * <p>
 * Other rules strip the element they apply to, replacing it by style properties on the contents, e.g.
 * <code>&lt;dir&gt;&lt;li&gt;&lt;p&gt;...&lt;/li&gt;&lt;/dir&gt;</code> becomes
 * <code>&lt;p style="margin-left: 1em"&gt;...</code>. These rules are applied to an element before processing its
 * content and replace the current element by the first element in the exposed content.
 * </p>
 * <p>
 * After applying both sets of rules, you can replace the style attribute by a class value and style rule in the
 * document head. To support this, an association of styles and class names is built. A naive approach is to rely on
 * string matching to test when two property lists are the same. A better approach would be to first sort the
 * properties before matching.
 * </p>
 * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
 * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
 * @author Fabrizio Giustina
 * @version $Revision: 802 $ ($Author: fgiust $)
 */
85 public class Clean
86 {
87
88 /**
89 * sequential number for generated css classes.
90 */
91 private int classNum = 1;
92
93 /**
94 * Tag table.
95 */
96 private TagTable tt;
97
98 /**
99 * Instantiates a new Clean.
100 * @param tagTable tag table instance
101 */
102 public Clean(TagTable tagTable)
103 {
104 this.tt = tagTable;
105 }
106
107 /**
108 * Insert a css style property.
109 * @param props StyleProp instance
110 * @param name property name
111 * @param value property value
112 * @return StyleProp containin the given property
113 */
114 private StyleProp insertProperty(StyleProp props, String name, String value)
115 {
116 StyleProp first, prev, prop;
117 int cmp;
118
119 prev = null;
120 first = props;
121
122 while (props != null)
123 {
124 cmp = props.name.compareTo(name);
125
126 if (cmp == 0)
127 {
128
129 return first;
130 }
131
132 if (cmp > 0)
133 {
134
135
136 prop = new StyleProp(name, value, props);
137
138 if (prev != null)
139 {
140 prev.next = prop;
141 }
142 else
143 {
144 first = prop;
145 }
146
147 return first;
148 }
149
150 prev = props;
151 props = props.next;
152 }
153
154 prop = new StyleProp(name, value, null);
155
156 if (prev != null)
157 {
158 prev.next = prop;
159 }
160 else
161 {
162 first = prop;
163 }
164
165 return first;
166 }
167
168 /**
169 * Create sorted linked list of properties from style string.
170 * @param prop StyleProp
171 * @param style style string
172 * @return StyleProp with given style
173 */
174 private StyleProp createProps(StyleProp prop, String style)
175 {
176 int nameEnd;
177 int valueEnd;
178 int valueStart = 0;
179 int nameStart = 0;
180 boolean more;
181
182 nameStart = 0;
183 while (nameStart < style.length())
184 {
185 while (nameStart < style.length() && style.charAt(nameStart) == ' ')
186 {
187 ++nameStart;
188 }
189
190 nameEnd = nameStart;
191
192 while (nameEnd < style.length())
193 {
194 if (style.charAt(nameEnd) == ':')
195 {
196 valueStart = nameEnd + 1;
197 break;
198 }
199
200 ++nameEnd;
201 }
202
203 if (nameEnd >= style.length() || style.charAt(nameEnd) != ':')
204 {
205 break;
206 }
207
208 while (valueStart < style.length() && style.charAt(valueStart) == ' ')
209 {
210 ++valueStart;
211 }
212
213 valueEnd = valueStart;
214 more = false;
215
216 while (valueEnd < style.length())
217 {
218 if (style.charAt(valueEnd) == ';')
219 {
220 more = true;
221 break;
222 }
223
224 ++valueEnd;
225 }
226
227 prop = insertProperty(prop, style.substring(nameStart, nameEnd), style.substring(valueStart, valueEnd));
228
229 if (more)
230 {
231 nameStart = valueEnd + 1;
232 continue;
233 }
234
235 break;
236 }
237
238 return prop;
239 }
240
241 /**
242 * Create a css property.
243 * @param props StyleProp
244 * @return css property as String
245 */
246 private String createPropString(StyleProp props)
247 {
248 String style = "";
249 int len;
250 StyleProp prop;
251
252
253 for (len = 0, prop = props; prop != null; prop = prop.next)
254 {
255 len += prop.name.length() + 2;
256 len += prop.value.length() + 2;
257 }
258
259 for (prop = props; prop != null; prop = prop.next)
260 {
261 style = style.concat(prop.name);
262 style = style.concat(": ");
263
264 style = style.concat(prop.value);
265
266 if (prop.next == null)
267 {
268 break;
269 }
270
271 style = style.concat("; ");
272 }
273
274 return style;
275 }
276
277 /**
278 * Creates a string with merged properties.
279 * @param style css style
280 * @param property css properties
281 * @return merged string
282 */
283 private String addProperty(String style, String property)
284 {
285 StyleProp prop;
286
287 prop = createProps(null, style);
288 prop = createProps(prop, property);
289 style = createPropString(prop);
290 return style;
291 }
292
293 /**
294 * Generates a new css class name.
295 * @param lexer Lexer
296 * @param tag Tag
297 * @return generated css class
298 */
299 private String gensymClass(Lexer lexer, String tag)
300 {
301 String str;
302
303 str = lexer.configuration.cssPrefix == null ? lexer.configuration.cssPrefix + this.classNum : "c"
304 + this.classNum;
305 this.classNum++;
306 return str;
307 }
308
309 /**
310 * Finds a css style.
311 * @param lexer Lexer
312 * @param tag tag name
313 * @param properties css properties
314 * @return style string
315 */
316 private String findStyle(Lexer lexer, String tag, String properties)
317 {
318 Style style;
319
320 for (style = lexer.styles; style != null; style = style.next)
321 {
322 if (style.tag.equals(tag) && style.properties.equals(properties))
323 {
324 return style.tagClass;
325 }
326 }
327
328 style = new Style(tag, gensymClass(lexer, tag), properties, lexer.styles);
329 lexer.styles = style;
330 return style.tagClass;
331 }
332
333 /**
334 * Find style attribute in node, and replace it by corresponding class attribute. Search for class in style
335 * dictionary otherwise gensym new class and add to dictionary. Assumes that node doesn't have a class attribute.
336 * @param lexer Lexer
337 * @param node node with a style attribute
338 */
339 private void style2Rule(Lexer lexer, Node node)
340 {
341 AttVal styleattr, classattr;
342 String classname;
343
344 styleattr = node.getAttrByName("style");
345
346 if (styleattr != null)
347 {
348 classname = findStyle(lexer, node.element, styleattr.value);
349 classattr = node.getAttrByName("class");
350
351
352
353 if (classattr != null)
354 {
355 classattr.value = classattr.value + " " + classname;
356 node.removeAttribute(styleattr);
357 }
358 else
359 {
360
361 styleattr.attribute = "class";
362 styleattr.value = classname;
363 }
364 }
365 }
366
367 /**
368 * Adds a css rule for color.
369 * @param lexer Lexer
370 * @param selector css selector
371 * @param color color value
372 */
373 private void addColorRule(Lexer lexer, String selector, String color)
374 {
375 if (color != null)
376 {
377 lexer.addStringLiteral(selector);
378 lexer.addStringLiteral(" { color: ");
379 lexer.addStringLiteral(color);
380 lexer.addStringLiteral(" }\n");
381 }
382 }
383
384 /**
385 * Move presentation attribs from body to style element.
386 *
387 * <pre>
388 * background="foo" . body { background-image: url(foo) }
389 * bgcolor="foo" . body { background-color: foo }
390 * text="foo" . body { color: foo }
391 * link="foo" . :link { color: foo }
392 * vlink="foo" . :visited { color: foo }
393 * alink="foo" . :active { color: foo }
394 * </pre>
395 *
396 * @param lexer Lexer
397 * @param body body node
398 */
399 private void cleanBodyAttrs(Lexer lexer, Node body)
400 {
401 AttVal attr;
402 String bgurl = null;
403 String bgcolor = null;
404 String color = null;
405
406 attr = body.getAttrByName("background");
407
408 if (attr != null)
409 {
410 bgurl = attr.value;
411 attr.value = null;
412 body.removeAttribute(attr);
413 }
414
415 attr = body.getAttrByName("bgcolor");
416
417 if (attr != null)
418 {
419 bgcolor = attr.value;
420 attr.value = null;
421 body.removeAttribute(attr);
422 }
423
424 attr = body.getAttrByName("text");
425
426 if (attr != null)
427 {
428 color = attr.value;
429 attr.value = null;
430 body.removeAttribute(attr);
431 }
432
433 if (bgurl != null || bgcolor != null || color != null)
434 {
435 lexer.addStringLiteral(" body {\n");
436
437 if (bgurl != null)
438 {
439 lexer.addStringLiteral(" background-image: url(");
440 lexer.addStringLiteral(bgurl);
441 lexer.addStringLiteral(");\n");
442 }
443
444 if (bgcolor != null)
445 {
446 lexer.addStringLiteral(" background-color: ");
447 lexer.addStringLiteral(bgcolor);
448 lexer.addStringLiteral(";\n");
449 }
450
451 if (color != null)
452 {
453 lexer.addStringLiteral(" color: ");
454 lexer.addStringLiteral(color);
455 lexer.addStringLiteral(";\n");
456 }
457
458 lexer.addStringLiteral(" }\n");
459 }
460
461 attr = body.getAttrByName("link");
462
463 if (attr != null)
464 {
465 addColorRule(lexer, " :link", attr.value);
466 body.removeAttribute(attr);
467 }
468
469 attr = body.getAttrByName("vlink");
470
471 if (attr != null)
472 {
473 addColorRule(lexer, " :visited", attr.value);
474 body.removeAttribute(attr);
475 }
476
477 attr = body.getAttrByName("alink");
478
479 if (attr != null)
480 {
481 addColorRule(lexer, " :active", attr.value);
482 body.removeAttribute(attr);
483 }
484 }
485
486 /**
487 * Check deprecated attributes in body tag.
488 * @param lexer Lexer
489 * @param doc document root node
490 * @return <code>true</code> is the body doesn't contain deprecated attributes, false otherwise.
491 */
492 private boolean niceBody(Lexer lexer, Node doc)
493 {
494 Node body = doc.findBody(lexer.configuration.tt);
495
496 if (body != null)
497 {
498 if (body.getAttrByName("background") != null
499 || body.getAttrByName("bgcolor") != null
500 || body.getAttrByName("text") != null
501 || body.getAttrByName("link") != null
502 || body.getAttrByName("vlink") != null
503 || body.getAttrByName("alink") != null)
504 {
505 lexer.badLayout |= Report.USING_BODY;
506 return false;
507 }
508 }
509
510 return true;
511 }
512
513 /**
514 * Create style element using rules from dictionary.
515 * @param lexer Lexer
516 * @param doc root node
517 */
518 private void createStyleElement(Lexer lexer, Node doc)
519 {
520 Node node, head, body;
521 Style style;
522 AttVal av;
523
524 if (lexer.styles == null && niceBody(lexer, doc))
525 {
526 return;
527 }
528
529 node = lexer.newNode(Node.START_TAG, null, 0, 0, "style");
530 node.implicit = true;
531
532
533 av = new AttVal(null, null, '"', "type", "text/css");
534 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
535 node.attributes = av;
536
537 body = doc.findBody(lexer.configuration.tt);
538
539 lexer.txtstart = lexer.lexsize;
540
541 if (body != null)
542 {
543 cleanBodyAttrs(lexer, body);
544 }
545
546 for (style = lexer.styles; style != null; style = style.next)
547 {
548 lexer.addCharToLexer(' ');
549 lexer.addStringLiteral(style.tag);
550 lexer.addCharToLexer('.');
551 lexer.addStringLiteral(style.tagClass);
552 lexer.addCharToLexer(' ');
553 lexer.addCharToLexer('{');
554 lexer.addStringLiteral(style.properties);
555 lexer.addCharToLexer('}');
556 lexer.addCharToLexer('\n');
557 }
558
559 lexer.txtend = lexer.lexsize;
560
561 node.insertNodeAtEnd(lexer.newNode(Node.TEXT_NODE, lexer.lexbuf, lexer.txtstart, lexer.txtend));
562
563
564
565
566 head = doc.findHEAD(lexer.configuration.tt);
567
568 if (head != null)
569 {
570 head.insertNodeAtEnd(node);
571 }
572 }
573
574 /**
575 * Ensure bidirectional links are consistent.
576 * @param node root node
577 */
578 private void fixNodeLinks(Node node)
579 {
580 Node child;
581
582 if (node.prev != null)
583 {
584 node.prev.next = node;
585 }
586 else
587 {
588 node.parent.content = node;
589 }
590
591 if (node.next != null)
592 {
593 node.next.prev = node;
594 }
595 else
596 {
597 node.parent.last = node;
598 }
599
600 for (child = node.content; child != null; child = child.next)
601 {
602 child.parent = node;
603 }
604 }
605
606 /**
607 * Used to strip child of node when the node has one and only one child.
608 * @param node parent node
609 */
610 private void stripOnlyChild(Node node)
611 {
612 Node child;
613
614 child = node.content;
615 node.content = child.content;
616 node.last = child.last;
617 child.content = null;
618
619 for (child = node.content; child != null; child = child.next)
620 {
621 child.parent = node;
622 }
623 }
624
625 /**
626 * Used to strip font start and end tags.
627 * @param element original node
628 * @param pnode passed in as array to allow modification. pnode[0] will contain the final node
629 * @todo remove the pnode parameter and make it a return value
630 */
631 private void discardContainer(Node element, Node[] pnode)
632 {
633 Node node;
634 Node parent = element.parent;
635
636 if (element.content != null)
637 {
638 element.last.next = element.next;
639
640 if (element.next != null)
641 {
642 element.next.prev = element.last;
643 element.last.next = element.next;
644 }
645 else
646 {
647 parent.last = element.last;
648 }
649
650 if (element.prev != null)
651 {
652 element.content.prev = element.prev;
653 element.prev.next = element.content;
654 }
655 else
656 {
657 parent.content = element.content;
658 }
659
660 for (node = element.content; node != null; node = node.next)
661 {
662 node.parent = parent;
663 }
664
665 pnode[0] = element.content;
666 }
667 else
668 {
669 if (element.next != null)
670 {
671 element.next.prev = element.prev;
672 }
673 else
674 {
675 parent.last = element.prev;
676 }
677
678 if (element.prev != null)
679 {
680 element.prev.next = element.next;
681 }
682 else
683 {
684 parent.content = element.next;
685 }
686
687 pnode[0] = element.next;
688 }
689
690 element.next = null;
691 element.content = null;
692 }
693
694 /**
695 * Add style property to element, creating style attribute as needed and adding ; delimiter.
696 * @param node node
697 * @param property property added to node
698 */
699 private void addStyleProperty(Node node, String property)
700 {
701 AttVal av;
702
703 for (av = node.attributes; av != null; av = av.next)
704 {
705 if (av.attribute.equals("style"))
706 {
707 break;
708 }
709 }
710
711
712
713 if (av != null)
714 {
715 String s;
716
717 s = addProperty(av.value, property);
718 av.value = s;
719 }
720 else
721 {
722
723 av = new AttVal(node.attributes, null, '"', "style", property);
724 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
725 node.attributes = av;
726 }
727 }
728
729 /**
730 * Create new string that consists of the combined style properties in s1 and s2. To merge property lists, we build
731 * a linked list of property/values and insert properties into the list in order, merging values for the same
732 * property name.
733 * @param s1 first property
734 * @param s2 second property
735 * @return merged properties
736 */
737 private String mergeProperties(String s1, String s2)
738 {
739 String s;
740 StyleProp prop;
741
742 prop = createProps(null, s1);
743 prop = createProps(prop, s2);
744 s = createPropString(prop);
745 return s;
746 }
747
748 /**
749 * Merge class attributes from 2 nodes.
750 * @param node Node
751 * @param child Child node
752 */
753 private void mergeClasses(Node node, Node child)
754 {
755 AttVal av;
756 String s1, s2, names;
757
758 for (s2 = null, av = child.attributes; av != null; av = av.next)
759 {
760 if ("class".equals(av.attribute))
761 {
762 s2 = av.value;
763 break;
764 }
765 }
766
767 for (s1 = null, av = node.attributes; av != null; av = av.next)
768 {
769 if ("class".equals(av.attribute))
770 {
771 s1 = av.value;
772 break;
773 }
774 }
775
776 if (s1 != null)
777 {
778 if (s2 != null)
779 {
780 names = s1 + ' ' + s2;
781 av.value = names;
782 }
783 }
784 else if (s2 != null)
785 {
786 av = new AttVal(node.attributes, null, '"', "class", s2);
787 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
788 node.attributes = av;
789 }
790 }
791
792 /**
793 * Merge style from 2 nodes.
794 * @param node Node
795 * @param child Child node
796 */
797 private void mergeStyles(Node node, Node child)
798 {
799 AttVal av;
800 String s1, s2, style;
801
802
803
804 mergeClasses(node, child);
805
806 for (s2 = null, av = child.attributes; av != null; av = av.next)
807 {
808 if (av.attribute.equals("style"))
809 {
810 s2 = av.value;
811 break;
812 }
813 }
814
815 for (s1 = null, av = node.attributes; av != null; av = av.next)
816 {
817 if (av.attribute.equals("style"))
818 {
819 s1 = av.value;
820 break;
821 }
822 }
823
824 if (s1 != null)
825 {
826 if (s2 != null)
827 {
828 style = mergeProperties(s1, s2);
829 av.value = style;
830 }
831 }
832 else if (s2 != null)
833 {
834 av = new AttVal(node.attributes, null, '"', "style", s2);
835 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
836 node.attributes = av;
837 }
838 }
839
840 /**
841 * Map a % font size to a named font size.
842 * @param size size in %
843 * @return font size name
844 */
845 private String fontSize2Name(String size)
846 {
847 String[] sizes = {"60%", "70%", "80%", null, "120%", "150%", "200%"};
848 String buf;
849
850 if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6')
851 {
852 int n = size.charAt(0) - '0';
853 return sizes[n];
854 }
855
856 if (size.length() > 0 && size.charAt(0) == '-')
857 {
858 if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6')
859 {
860 int n = size.charAt(1) - '0';
861 double x;
862
863 for (x = 1.0; n > 0; --n)
864 {
865 x *= 0.8;
866 }
867
868 x *= 100.0;
869 buf = "" + (int) x + "%";
870
871 return buf;
872 }
873
874 return "smaller";
875 }
876
877 if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6')
878 {
879 int n = size.charAt(1) - '0';
880 double x;
881
882 for (x = 1.0; n > 0; --n)
883 {
884 x *= 1.2;
885 }
886
887 x *= 100.0;
888 buf = "" + (int) x + "%";
889
890 return buf;
891 }
892
893 return "larger";
894 }
895
896 /**
897 * Adds a font-family style.
898 * @param node Node
899 * @param face font face
900 */
901 private void addFontFace(Node node, String face)
902 {
903 addStyleProperty(node, "font-family: " + face);
904 }
905
906 /**
907 * Adds a font size style.
908 * @param node Node
909 * @param size font size
910 */
911 private void addFontSize(Node node, String size)
912 {
913 if (size == null)
914 {
915 return;
916 }
917
918 if ("6".equals(size) && node.tag == this.tt.tagP)
919 {
920 node.element = "h1";
921 this.tt.findTag(node);
922 return;
923 }
924
925 if ("5".equals(size) && node.tag == this.tt.tagP)
926 {
927 node.element = "h2";
928 this.tt.findTag(node);
929 return;
930 }
931
932 if ("4".equals(size) && node.tag == this.tt.tagP)
933 {
934 node.element = "h3";
935 this.tt.findTag(node);
936 return;
937 }
938
939 String value = fontSize2Name(size);
940
941 if (value != null)
942 {
943 addStyleProperty(node, "font-size: " + value);
944 }
945 }
946
947 /**
948 * Adds a font color style.
949 * @param node Node
950 * @param color color value
951 */
952 private void addFontColor(Node node, String color)
953 {
954 addStyleProperty(node, "color: " + color);
955 }
956
957 /**
958 * Adds an align style.
959 * @param node Node
960 * @param align align value
961 */
962 private void addAlign(Node node, String align)
963 {
964
965 addStyleProperty(node, "text-align: " + align.toLowerCase());
966 }
967
968 /**
969 * Add style properties to node corresponding to the font face, size and color attributes.
970 * @param node font tag
971 * @param av attribute list for node
972 */
973 private void addFontStyles(Node node, AttVal av)
974 {
975 while (av != null)
976 {
977 if (av.attribute.equals("face"))
978 {
979 addFontFace(node, av.value);
980 }
981 else if (av.attribute.equals("size"))
982 {
983 addFontSize(node, av.value);
984 }
985 else if (av.attribute.equals("color"))
986 {
987 addFontColor(node, av.value);
988 }
989
990 av = av.next;
991 }
992 }
993
994 /**
995 * Symptom: <code><p align=center></code>. Action: <code><p style="text-align: center"></code>.
996 * @param lexer Lexer
997 * @param node node with center attribute. Will be modified to use css style.
998 */
999 private void textAlign(Lexer lexer, Node node)
1000 {
1001 AttVal av, prev;
1002
1003 prev = null;
1004
1005 for (av = node.attributes; av != null; av = av.next)
1006 {
1007 if (av.attribute.equals("align"))
1008 {
1009 if (prev != null)
1010 {
1011 prev.next = av.next;
1012 }
1013 else
1014 {
1015 node.attributes = av.next;
1016 }
1017
1018 if (av.value != null)
1019 {
1020 addAlign(node, av.value);
1021 }
1022
1023 break;
1024 }
1025
1026 prev = av;
1027 }
1028 }
1029
1030 /**
1031 * Symptom: <code><dir><li></code> where <code><li></code> is only child. Action: coerce
1032 * <code><dir> <li></code> to <code><div></code> with indent. The clean up rules use the pnode argument
1033 * to return the next node when the original node has been deleted.
1034 * @param lexer Lexer
1035 * @param node dir tag
1036 * @return <code>true</code> if a dir tag has been coerced to a div
1037 */
1038 private boolean dir2Div(Lexer lexer, Node node)
1039 {
1040 Node child;
1041
1042 if (node.tag == this.tt.tagDir || node.tag == this.tt.tagUl || node.tag == this.tt.tagOl)
1043 {
1044 child = node.content;
1045
1046 if (child == null)
1047 {
1048 return false;
1049 }
1050
1051
1052 if (child.next != null)
1053 {
1054 return false;
1055 }
1056
1057 if (child.tag != this.tt.tagLi)
1058 {
1059 return false;
1060 }
1061
1062 if (!child.implicit)
1063 {
1064 return false;
1065 }
1066
1067
1068 node.tag = this.tt.tagDiv;
1069 node.element = "div";
1070 addStyleProperty(node, "margin-left: 2em");
1071 stripOnlyChild(node);
1072 return true;
1073 }
1074
1075 return false;
1076 }
1077
1078 /**
1079 * Symptom:
1080 *
1081 * <pre>
1082 * <center>
1083 * </pre>.
1084 * <p>
1085 * Action: replace <code><center></code> by <code><div style="text-align: center"></code>
1086 * </p>
1087 * @param lexer Lexer
1088 * @param node center tag
1089 * @param pnode pnode[0] is the same as node, passed in as an array to allow modification
1090 * @return <code>true</code> if a center tag has been replaced by a div
1091 */
1092 private boolean center2Div(Lexer lexer, Node node, Node[] pnode)
1093 {
1094 if (node.tag == this.tt.tagCenter)
1095 {
1096 if (lexer.configuration.dropFontTags)
1097 {
1098 if (node.content != null)
1099 {
1100 Node last = node.last;
1101 Node parent = node.parent;
1102
1103 discardContainer(node, pnode);
1104
1105 node = lexer.inferredTag("br");
1106
1107 if (last.next != null)
1108 {
1109 last.next.prev = node;
1110 }
1111
1112 node.next = last.next;
1113 last.next = node;
1114 node.prev = last;
1115
1116 if (parent.last == last)
1117 {
1118 parent.last = node;
1119 }
1120
1121 node.parent = parent;
1122 }
1123 else
1124 {
1125 Node prev = node.prev;
1126 Node next = node.next;
1127 Node parent = node.parent;
1128 discardContainer(node, pnode);
1129
1130 node = lexer.inferredTag("br");
1131 node.next = next;
1132 node.prev = prev;
1133 node.parent = parent;
1134
1135 if (next != null)
1136 {
1137 next.prev = node;
1138 }
1139 else
1140 {
1141 parent.last = node;
1142 }
1143
1144 if (prev != null)
1145 {
1146 prev.next = node;
1147 }
1148 else
1149 {
1150 parent.content = node;
1151 }
1152 }
1153
1154 return true;
1155 }
1156 node.tag = this.tt.tagDiv;
1157 node.element = "div";
1158 addStyleProperty(node, "text-align: center");
1159 return true;
1160 }
1161
1162 return false;
1163 }
1164
1165 /**
1166 * Symptom: <code><div><div>...</div></div></code> Action: merge the two divs. This is useful after
1167 * nested <dir>s used by Word for indenting have been converted to <div>s.
1168 * @param lexer Lexer
1169 * @param node first div
1170 * @return true if the divs have been merged
1171 */
1172 private boolean mergeDivs(Lexer lexer, Node node)
1173 {
1174 Node child;
1175
1176 if (node.tag != this.tt.tagDiv)
1177 {
1178 return false;
1179 }
1180
1181 child = node.content;
1182
1183 if (child == null)
1184 {
1185 return false;
1186 }
1187
1188 if (child.tag != this.tt.tagDiv)
1189 {
1190 return false;
1191 }
1192
1193 if (child.next != null)
1194 {
1195 return false;
1196 }
1197
1198 mergeStyles(node, child);
1199 stripOnlyChild(node);
1200 return true;
1201 }
1202
1203 /**
1204 * Symptom:
1205 * <ul>
1206 * <li>
1207 * <ul>
1208 * ...
1209 * </ul>
1210 * </li>
1211 * </ul>
1212 * Action: discard outer list.
1213 * @param lexer Lexer
1214 * @param node Node
1215 * @param pnode passed in as array to allow modifications.
1216 * @return <code>true</code> if nested lists have been found and replaced
1217 */
1218 private boolean nestedList(Lexer lexer, Node node, Node[] pnode)
1219 {
1220 Node child, list;
1221
1222 if (node.tag == this.tt.tagUl || node.tag == this.tt.tagOl)
1223 {
1224 child = node.content;
1225
1226 if (child == null)
1227 {
1228 return false;
1229 }
1230
1231
1232
1233 if (child.next != null)
1234 {
1235 return false;
1236 }
1237
1238 list = child.content;
1239
1240 if (list == null)
1241 {
1242 return false;
1243 }
1244
1245 if (list.tag != node.tag)
1246 {
1247 return false;
1248 }
1249
1250 pnode[0] = list;
1251
1252
1253 list.prev = node.prev;
1254 list.next = node.next;
1255 list.parent = node.parent;
1256 fixNodeLinks(list);
1257
1258
1259
1260 child.content = null;
1261 node.content = null;
1262 node.next = null;
1263 node = null;
1264
1265
1266
1267 if (list.prev != null)
1268 {
1269 if (list.prev.tag == this.tt.tagUl || list.prev.tag == this.tt.tagOl)
1270 {
1271
1272 node = list;
1273 list = node.prev;
1274
1275 list.next = node.next;
1276
1277 if (list.next != null)
1278 {
1279 list.next.prev = list;
1280 }
1281
1282 child = list.last;
1283
1284 node.parent = child;
1285 node.next = null;
1286 node.prev = child.last;
1287 fixNodeLinks(node);
1288 cleanNode(lexer, node);
1289 }
1290 }
1291
1292 return true;
1293 }
1294
1295 return false;
1296 }
1297
1298 /**
1299 * Symptom: the only child of a block-level element is a presentation element such as B, I or FONT. Action: add
1300 * style "font-weight: bold" to the block and strip the <b>element, leaving its children. example:
1301 *
1302 * <pre>
1303 * <p>
1304 * <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1305 * </p>
1306 * </pre>
1307 *
1308 * becomes:
1309 *
1310 * <pre>
1311 * <p style="font-weight: bold; font-family: Arial; font-size: 6">
1312 * Draft Recommended Practice
1313 * </p>
1314 * </pre>
1315 *
1316 * <p>
1317 * This code also replaces the align attribute by a style attribute. However, to avoid CSS problems with Navigator
1318 * 4, this isn't done for the elements: caption, tr and table
1319 * </p>
1320 * @param lexer Lexer
1321 * @param node parent node
1322 * @return <code>true</code> if the child node has been removed
1323 */
1324 private boolean blockStyle(Lexer lexer, Node node)
1325 {
1326 Node child;
1327
1328 if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0)
1329 {
1330 if (node.tag != this.tt.tagTable && node.tag != this.tt.tagTr && node.tag != this.tt.tagLi)
1331 {
1332
1333 if (node.tag != this.tt.tagCaption)
1334 {
1335 textAlign(lexer, node);
1336 }
1337
1338 child = node.content;
1339
1340 if (child == null)
1341 {
1342 return false;
1343 }
1344
1345
1346 if (child.next != null)
1347 {
1348 return false;
1349 }
1350
1351 if (child.tag == this.tt.tagB)
1352 {
1353 mergeStyles(node, child);
1354 addStyleProperty(node, "font-weight: bold");
1355 stripOnlyChild(node);
1356 return true;
1357 }
1358
1359 if (child.tag == this.tt.tagI)
1360 {
1361 mergeStyles(node, child);
1362 addStyleProperty(node, "font-style: italic");
1363 stripOnlyChild(node);
1364 return true;
1365 }
1366
1367 if (child.tag == this.tt.tagFont)
1368 {
1369 mergeStyles(node, child);
1370 addFontStyles(node, child.attributes);
1371 stripOnlyChild(node);
1372 return true;
1373 }
1374 }
1375 }
1376
1377 return false;
1378 }
1379
1380 /**
1381 * If the node has only one b, i, or font child remove the child node and add the appropriate style attributes to
1382 * parent.
1383 * @param lexer Lexer
1384 * @param node parent node
1385 * @param pnode passed as an array to allow modifications
1386 * @return <code>true</code> if child node has been stripped, replaced by style attributes.
1387 */
1388 private boolean inlineStyle(Lexer lexer, Node node, Node[] pnode)
1389 {
1390 Node child;
1391
1392 if (node.tag != this.tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0)
1393 {
1394 child = node.content;
1395
1396 if (child == null)
1397 {
1398 return false;
1399 }
1400
1401
1402 if (child.next != null)
1403 {
1404 return false;
1405 }
1406
1407 if (child.tag == this.tt.tagB && lexer.configuration.logicalEmphasis)
1408 {
1409 mergeStyles(node, child);
1410 addStyleProperty(node, "font-weight: bold");
1411 stripOnlyChild(node);
1412 return true;
1413 }
1414
1415 if (child.tag == this.tt.tagI && lexer.configuration.logicalEmphasis)
1416 {
1417 mergeStyles(node, child);
1418 addStyleProperty(node, "font-style: italic");
1419 stripOnlyChild(node);
1420 return true;
1421 }
1422
1423 if (child.tag == this.tt.tagFont)
1424 {
1425 mergeStyles(node, child);
1426 addFontStyles(node, child.attributes);
1427 stripOnlyChild(node);
1428 return true;
1429 }
1430 }
1431
1432 return false;
1433 }
1434
1435 /**
1436 * Replace font elements by span elements, deleting the font element's attributes and replacing them by a single
1437 * style attribute.
1438 * @param lexer Lexer
1439 * @param node font tag
1440 * @param pnode passed as an array to allow modifications
1441 * @return <code>true</code> if a font tag has been dropped and replaced by style attributes
1442 */
1443 private boolean font2Span(Lexer lexer, Node node, Node[] pnode)
1444 {
1445 AttVal av, style, next;
1446
1447 if (node.tag == this.tt.tagFont)
1448 {
1449 if (lexer.configuration.dropFontTags)
1450 {
1451 discardContainer(node, pnode);
1452 return false;
1453 }
1454
1455
1456 if (node.parent.content == node && node.next == null)
1457 {
1458 return false;
1459 }
1460
1461 addFontStyles(node, node.attributes);
1462
1463
1464 av = node.attributes;
1465 style = null;
1466
1467 while (av != null)
1468 {
1469 next = av.next;
1470
1471 if (av.attribute.equals("style"))
1472 {
1473 av.next = null;
1474 style = av;
1475 }
1476
1477 av = next;
1478 }
1479
1480 node.attributes = style;
1481
1482 node.tag = this.tt.tagSpan;
1483 node.element = "span";
1484
1485 return true;
1486 }
1487
1488 return false;
1489 }
1490
1491 /**
1492 * Applies all matching rules to a node.
1493 * @param lexer Lexer
1494 * @param node original node
1495 * @return cleaned up node
1496 */
1497 private Node cleanNode(Lexer lexer, Node node)
1498 {
1499 Node next = null;
1500 Node[] o = new Node[1];
1501 boolean b = false;
1502
1503 for (next = node; node != null && node.isElement(); node = next)
1504 {
1505 o[0] = next;
1506
1507 b = dir2Div(lexer, node);
1508 next = o[0];
1509 if (b)
1510 {
1511 continue;
1512 }
1513
1514
1515
1516 b = nestedList(lexer, node, o);
1517 next = o[0];
1518 if (b)
1519 {
1520 return next;
1521 }
1522
1523 b = center2Div(lexer, node, o);
1524 next = o[0];
1525 if (b)
1526 {
1527 continue;
1528 }
1529
1530 b = mergeDivs(lexer, node);
1531 next = o[0];
1532 if (b)
1533 {
1534 continue;
1535 }
1536
1537 b = blockStyle(lexer, node);
1538 next = o[0];
1539 if (b)
1540 {
1541 continue;
1542 }
1543
1544 b = inlineStyle(lexer, node, o);
1545 next = o[0];
1546 if (b)
1547 {
1548 continue;
1549 }
1550
1551 b = font2Span(lexer, node, o);
1552 next = o[0];
1553 if (b)
1554 {
1555 continue;
1556 }
1557
1558 break;
1559 }
1560
1561 return next;
1562 }
1563
1564 /**
1565 * Special case: if the current node is destroyed by CleanNode() lower in the tree, this node and its parent no
1566 * longer exist. So we must jump back up the CreateStyleProperties() call stack until we have a valid node
1567 * reference.
1568 * @param lexer Lexer
1569 * @param node Node
1570 * @param prepl passed in as array to allow modifications
1571 * @return cleaned Node
1572 */
1573 private Node createStyleProperties(Lexer lexer, Node node, Node[] prepl)
1574 {
1575 Node child = node.content;
1576
1577 if (child != null)
1578 {
1579 Node[] repl = new Node[1];
1580 repl[0] = node;
1581 while (child != null)
1582 {
1583 child = createStyleProperties(lexer, child, repl);
1584 if (repl[0] != node)
1585 {
1586 return repl[0];
1587 }
1588 if (child != null)
1589 {
1590 child = child.next;
1591 }
1592 }
1593 }
1594
1595 return cleanNode(lexer, node);
1596 }
1597
1598 /**
1599 * Find style attribute in node content, and replace it by corresponding class attribute.
1600 * @param lexer Lexer
1601 * @param node parent node
1602 */
1603 private void defineStyleRules(Lexer lexer, Node node)
1604 {
1605 Node child;
1606
1607 if (node.content != null)
1608 {
1609 child = node.content;
1610 while (child != null)
1611 {
1612 defineStyleRules(lexer, child);
1613 child = child.next;
1614 }
1615 }
1616
1617 style2Rule(lexer, node);
1618 }
1619
1620 /**
1621 * Clean an html tree.
1622 * @param lexer Lexer
1623 * @param doc root node
1624 */
1625 public void cleanTree(Lexer lexer, Node doc)
1626 {
1627 Node[] repl = new Node[1];
1628 repl[0] = doc;
1629 doc = createStyleProperties(lexer, doc, repl);
1630
1631 if (!lexer.configuration.makeClean)
1632 {
1633 defineStyleRules(lexer, doc);
1634 createStyleElement(lexer, doc);
1635 }
1636 }
1637
1638 /**
1639 * simplifies <b><b>... </b> ... </b> etc.
1640 * @param node root Node
1641 */
1642 public void nestedEmphasis(Node node)
1643 {
1644 Node[] o = new Node[1];
1645 Node next;
1646
1647 while (node != null)
1648 {
1649 next = node.next;
1650
1651 if ((node.tag == this.tt.tagB || node.tag == this.tt.tagI)
1652 && node.parent != null
1653 && node.parent.tag == node.tag)
1654 {
1655
1656 o[0] = next;
1657 discardContainer(node, o);
1658 next = o[0];
1659 node = next;
1660 continue;
1661 }
1662
1663 if (node.content != null)
1664 {
1665 nestedEmphasis(node.content);
1666 }
1667
1668 node = next;
1669 }
1670 }
1671
1672 /**
1673 * Replace i by em and b by strong.
1674 * @param node root Node
1675 */
1676 public void emFromI(Node node)
1677 {
1678 while (node != null)
1679 {
1680 if (node.tag == this.tt.tagI)
1681 {
1682 node.element = this.tt.tagEm.name;
1683 node.tag = this.tt.tagEm;
1684 }
1685 else if (node.tag == this.tt.tagB)
1686 {
1687 node.element = this.tt.tagStrong.name;
1688 node.tag = this.tt.tagStrong;
1689 }
1690
1691 if (node.content != null)
1692 {
1693 emFromI(node.content);
1694 }
1695
1696 node = node.next;
1697 }
1698 }
1699
1700 /**
1701 * Some people use dir or ul without an li to indent the content. The pattern to look for is a list with a single
1702 * implicit li. This is recursively replaced by an implicit blockquote.
1703 * @param node root Node
1704 */
1705 public void list2BQ(Node node)
1706 {
1707 while (node != null)
1708 {
1709 if (node.content != null)
1710 {
1711 list2BQ(node.content);
1712 }
1713
1714 if (node.tag != null
1715 && node.tag.getParser() == ParserImpl.LIST
1716 && node.hasOneChild()
1717 && node.content.implicit)
1718 {
1719 stripOnlyChild(node);
1720 node.element = this.tt.tagBlockquote.name;
1721 node.tag = this.tt.tagBlockquote;
1722 node.implicit = true;
1723 }
1724
1725 node = node.next;
1726 }
1727 }
1728
1729 /**
1730 * Replace implicit blockquote by div with an indent taking care to reduce nested blockquotes to a single div with
1731 * the indent set to match the nesting depth.
1732 * @param node root Node
1733 */
1734 public void bQ2Div(Node node)
1735 {
1736 int indent;
1737 String indentBuf;
1738 AttVal attval;
1739
1740 while (node != null)
1741 {
1742 if (node.tag == this.tt.tagBlockquote && node.implicit)
1743 {
1744 indent = 1;
1745
1746 while (node.hasOneChild() && node.content.tag == this.tt.tagBlockquote && node.implicit)
1747 {
1748 ++indent;
1749 stripOnlyChild(node);
1750 }
1751
1752 if (node.content != null)
1753 {
1754 bQ2Div(node.content);
1755 }
1756
1757 indentBuf = "margin-left: " + (new Integer(2 * indent)).toString() + "em";
1758
1759 node.element = this.tt.tagDiv.name;
1760 node.tag = this.tt.tagDiv;
1761
1762 attval = node.getAttrByName("style");
1763
1764 if (attval != null && attval.value != null)
1765 {
1766 attval.value = indentBuf + "; " + attval.value;
1767 }
1768 else
1769 {
1770 node.addAttribute("style", indentBuf);
1771 }
1772 }
1773 else if (node.content != null)
1774 {
1775 bQ2Div(node.content);
1776 }
1777
1778 node = node.next;
1779 }
1780 }
1781
1782 /**
1783 * Find the enclosing table cell for the given node.
1784 * @param node Node
1785 * @return enclosing cell node
1786 */
1787 Node findEnclosingCell(Node node)
1788 {
1789 Node check;
1790
1791 for (check = node; check != null; check = check.parent)
1792 {
1793 if (check.tag == tt.tagTd)
1794 {
1795 return check;
1796 }
1797 }
1798 return null;
1799 }
1800
1801 /**
     * The given node is a <code>&lt;![if ...]&gt;</code> section tag: prune it and everything that follows, up to and
     * including the closing <code>&lt;![endif]&gt;</code>.
1803 * @param lexer Lexer
1804 * @param node Node
1805 * @return cleaned up Node
1806 */
1807 public Node pruneSection(Lexer lexer, Node node)
1808 {
1809 for (;;)
1810 {
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827 node = Node.discardElement(node);
1828
1829 if (node == null)
1830 {
1831 return null;
1832 }
1833
1834 if (node.type == Node.SECTION_TAG)
1835 {
1836 if ((TidyUtils.getString(node.textarray, node.start, 2)).equals("if"))
1837 {
1838 node = pruneSection(lexer, node);
1839 continue;
1840 }
1841
1842 if ((TidyUtils.getString(node.textarray, node.start, 5)).equals("endif"))
1843 {
1844 node = Node.discardElement(node);
1845 break;
1846 }
1847 }
1848 }
1849
1850 return node;
1851 }
1852
1853 /**
1854 * Drop if/endif sections inserted by word2000.
1855 * @param lexer Lexer
1856 * @param node Node root node
1857 */
1858 public void dropSections(Lexer lexer, Node node)
1859 {
1860 while (node != null)
1861 {
1862 if (node.type == Node.SECTION_TAG)
1863 {
1864
1865 if ((TidyUtils.getString(node.textarray, node.start, 2)).equals("if")
1866 && (!(TidyUtils.getString(node.textarray, node.start, 7)).equals("if !vml")))
1867
1868 {
1869 node = pruneSection(lexer, node);
1870 continue;
1871 }
1872
1873
1874 node = Node.discardElement(node);
1875 continue;
1876 }
1877
1878 if (node.content != null)
1879 {
1880 dropSections(lexer, node.content);
1881 }
1882
1883 node = node.next;
1884 }
1885 }
1886
1887 /**
1888 * Remove word2000 attributes from node.
1889 * @param node node to cleanup
1890 */
1891 public void purgeWord2000Attributes(Node node)
1892 {
1893 AttVal attr = null;
1894 AttVal next = null;
1895 AttVal prev = null;
1896
1897 for (attr = node.attributes; attr != null; attr = next)
1898 {
1899 next = attr.next;
1900
1901
1902
1903 if (attr.attribute != null && attr.value != null && attr.attribute.equals("class"))
1904 {
1905 if (attr.value.equals("Code") || !attr.value.startsWith("Mso"))
1906 {
1907 prev = attr;
1908 continue;
1909 }
1910 }
1911
1912 if (attr.attribute != null
1913 && (attr.attribute.equals("class")
1914 || attr.attribute.equals("style")
1915 || attr.attribute.equals("lang")
1916 || attr.attribute.startsWith("x:") || ((attr.attribute.equals("height") || attr.attribute
1917 .equals("width")) &&
1918 (node.tag == this.tt.tagTd || node.tag == this.tt.tagTr || node.tag == this.tt.tagTh))))
1919 {
1920 if (prev != null)
1921 {
1922 prev.next = next;
1923 }
1924 else
1925 {
1926 node.attributes = next;
1927 }
1928
1929 }
1930 else
1931 {
1932 prev = attr;
1933 }
1934 }
1935 }
1936
1937 /**
1938 * Word2000 uses span excessively, so we strip span out.
1939 * @param lexer Lexer
1940 * @param span Node span
1941 * @return cleaned node
1942 */
1943 public Node stripSpan(Lexer lexer, Node span)
1944 {
1945 Node node;
1946 Node prev = null;
1947 Node content;
1948
1949
1950
1951
1952 cleanWord2000(lexer, span.content);
1953 content = span.content;
1954
1955 if (span.prev != null)
1956 {
1957 prev = span.prev;
1958 }
1959 else if (content != null)
1960 {
1961 node = content;
1962 content = content.next;
1963 node.removeNode();
1964 Node.insertNodeBeforeElement(span, node);
1965 prev = node;
1966 }
1967
1968 while (content != null)
1969 {
1970 node = content;
1971 content = content.next;
1972 node.removeNode();
1973 prev.insertNodeAfterElement(node);
1974 prev = node;
1975 }
1976
1977 if (span.next == null)
1978 {
1979 span.parent.last = prev;
1980 }
1981
1982 node = span.next;
1983 span.content = null;
1984 Node.discardElement(span);
1985 return node;
1986 }
1987
1988 /**
1989 * Map non-breaking spaces to regular spaces.
1990 * @param lexer Lexer
1991 * @param node Node
1992 */
1993 private void normalizeSpaces(Lexer lexer, Node node)
1994 {
1995 while (node != null)
1996 {
1997 if (node.content != null)
1998 {
1999 normalizeSpaces(lexer, node.content);
2000 }
2001
2002 if (node.type == Node.TEXT_NODE)
2003 {
2004 int i;
2005 int[] c = new int[1];
2006 int p = node.start;
2007
2008 for (i = node.start; i < node.end; ++i)
2009 {
2010 c[0] = node.textarray[i];
2011
2012
2013 if (c[0] > 0x7F)
2014 {
2015 i += PPrint.getUTF8(node.textarray, i, c);
2016 }
2017
2018 if (c[0] == 160)
2019 {
2020 c[0] = ' ';
2021 }
2022
2023 p = PPrint.putUTF8(node.textarray, p, c[0]);
2024 }
2025 }
2026
2027 node = node.next;
2028 }
2029 }
2030
2031 /**
2032 * Used to hunt for hidden preformatted sections.
2033 * @param node checked node
     * @return <code>true</code> if the node's style attribute contains both "margin-top: 0" and "margin-bottom: 0"
2035 */
2036 boolean noMargins(Node node)
2037 {
2038 AttVal attval = node.getAttrByName("style");
2039
2040 if (attval == null || attval.value == null)
2041 {
2042 return false;
2043 }
2044
2045
2046 if (attval.value.indexOf("margin-top: 0") == -1)
2047 {
2048 return false;
2049 }
2050
2051
2052 if (attval.value.indexOf("margin-bottom: 0") == -1)
2053 {
2054 return false;
2055 }
2056
2057 return true;
2058 }
2059
2060 /**
2061 * Does element have a single space as its content?
2062 * @param lexer Lexer
2063 * @param node checked node
2064 * @return <code>true</code> if the element has a single space as its content
2065 */
2066 boolean singleSpace(Lexer lexer, Node node)
2067 {
2068 if (node.content != null)
2069 {
2070 node = node.content;
2071
2072 if (node.next != null)
2073 {
2074 return false;
2075 }
2076
2077 if (node.type != Node.TEXT_NODE)
2078 {
2079 return false;
2080 }
2081
2082 if (((node.end - node.start) == 1) && lexer.lexbuf[node.start] == ' ')
2083 {
2084 return true;
2085 }
2086
2087 if ((node.end - node.start) == 2)
2088 {
2089 int[] c = new int[1];
2090
2091 PPrint.getUTF8(lexer.lexbuf, node.start, c);
2092
2093 if (c[0] == 160)
2094 {
2095 return true;
2096 }
2097 }
2098 }
2099
2100 return false;
2101 }
2102
2103 /**
2104 * This is a major clean up to strip out all the extra stuff you get when you save as web page from Word 2000. It
2105 * doesn't yet know what to do with VML tags, but these will appear as errors unless you declare them as new tags,
2106 * such as o:p which needs to be declared as inline.
2107 * @param lexer Lexer
2108 * @param node node to clean up
2109 */
2110 public void cleanWord2000(Lexer lexer, Node node)
2111 {
2112
2113 Node list = null;
2114
2115 while (node != null)
2116 {
2117
2118
2119 if (node.tag == tt.tagHtml)
2120 {
2121
2122 if ((node.getAttrByName("xmlns:o") == null))
2123 {
2124 return;
2125 }
2126 lexer.configuration.tt.freeAttrs(node);
2127 }
2128
2129
2130 if (node.tag == tt.tagP)
2131 {
2132 if (noMargins(node))
2133 {
2134 Node pre;
2135 Node next;
2136 Node.coerceNode(lexer, node, tt.tagPre);
2137
2138 purgeWord2000Attributes(node);
2139
2140 if (node.content != null)
2141 {
2142 cleanWord2000(lexer, node.content);
2143 }
2144
2145 pre = node;
2146 node = node.next;
2147
2148
2149 while (node.tag == tt.tagP && noMargins(node))
2150 {
2151 next = node.next;
2152 node.removeNode();
2153 pre.insertNodeAtEnd(lexer.newLineNode());
2154 pre.insertNodeAtEnd(node);
2155 stripSpan(lexer, node);
2156 node = next;
2157 }
2158
2159 if (node == null)
2160 {
2161 break;
2162 }
2163 }
2164 }
2165
2166 if (node.tag != null && TidyUtils.toBoolean(node.tag.model & Dict.CM_BLOCK) && singleSpace(lexer, node))
2167 {
2168 node = stripSpan(lexer, node);
2169 continue;
2170 }
2171
2172
2173 if (node.tag == this.tt.tagStyle || node.tag == this.tt.tagMeta || node.type == Node.COMMENT_TAG)
2174 {
2175 node = Node.discardElement(node);
2176 continue;
2177 }
2178
2179
2180 if (node.tag == this.tt.tagSpan || node.tag == this.tt.tagFont)
2181 {
2182 node = stripSpan(lexer, node);
2183 continue;
2184 }
2185
2186 if (node.tag == this.tt.tagLink)
2187 {
2188 AttVal attr = node.getAttrByName("rel");
2189
2190 if (attr != null && attr.value != null && attr.value.equals("File-List"))
2191 {
2192 node = Node.discardElement(node);
2193 continue;
2194 }
2195 }
2196
2197
2198 if (node.content == null && node.tag == this.tt.tagP)
2199 {
2200 node = Node.discardElement(node);
2201 continue;
2202 }
2203
2204 if (node.tag == this.tt.tagP)
2205 {
2206 AttVal attr = node.getAttrByName("class");
2207 AttVal atrStyle = node.getAttrByName("style");
2208
2209
2210
2211
2212
2213
2214
2215 if (attr != null
2216 && attr.value != null
2217 && ((attr.value.equals("MsoListBullet") || attr.value.equals("MsoListNumber"))
2218 || (atrStyle != null && (atrStyle.value.indexOf("mso-list:") != -1))))
2219
2220 {
2221 Dict listType = tt.tagUl;
2222
2223 if (attr.value.equals("MsoListNumber"))
2224 {
2225 listType = tt.tagOl;
2226 }
2227
2228 Node.coerceNode(lexer, node, this.tt.tagLi);
2229
2230 if (list == null || list.tag != listType)
2231 {
2232 list = lexer.inferredTag(listType.name);
2233 Node.insertNodeBeforeElement(node, list);
2234 }
2235
2236 purgeWord2000Attributes(node);
2237
2238 if (node.content != null)
2239 {
2240 cleanWord2000(lexer, node.content);
2241 }
2242
2243
2244 node.removeNode();
2245 list.insertNodeAtEnd(node);
2246 node = list;
2247 }
2248
2249 else if (attr != null && attr.value != null && attr.value.equals("Code"))
2250 {
2251 Node br = lexer.newLineNode();
2252 normalizeSpaces(lexer, node);
2253
2254 if (list == null || list.tag != this.tt.tagPre)
2255 {
2256 list = lexer.inferredTag("pre");
2257 Node.insertNodeBeforeElement(node, list);
2258 }
2259
2260
2261 node.removeNode();
2262 list.insertNodeAtEnd(node);
2263 stripSpan(lexer, node);
2264 list.insertNodeAtEnd(br);
2265 node = list.next;
2266 }
2267 else
2268 {
2269 list = null;
2270 }
2271 }
2272 else
2273 {
2274 list = null;
2275 }
2276
2277
2278 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2279 {
2280 purgeWord2000Attributes(node);
2281 }
2282
2283 if (node.content != null)
2284 {
2285 cleanWord2000(lexer, node.content);
2286 }
2287
2288 node = node.next;
2289 }
2290 }
2291
2292 /**
2293 * Check if the current document is a converted Word document.
2294 * @param root root Node
     * @return <code>true</code> if the document has been generated by Microsoft Word.
2296 */
2297 public boolean isWord2000(Node root)
2298 {
2299 AttVal attval;
2300 Node node;
2301 Node head;
2302 Node html = root.findHTML(this.tt);
2303
2304 if (html != null && html.getAttrByName("xmlns:o") != null)
2305 {
2306 return true;
2307 }
2308
2309
2310 head = root.findHEAD(tt);
2311
2312 if (head != null)
2313 {
2314 for (node = head.content; node != null; node = node.next)
2315 {
2316 if (node.tag != tt.tagMeta)
2317 {
2318 continue;
2319 }
2320
2321 attval = node.getAttrByName("name");
2322
2323 if (attval == null || attval.value == null)
2324 {
2325 continue;
2326 }
2327
2328 if (!"generator".equals(attval.value))
2329 {
2330 continue;
2331 }
2332
2333 attval = node.getAttrByName("content");
2334
2335 if (attval == null || attval.value == null)
2336 {
2337 continue;
2338 }
2339
2340 if (attval.value.indexOf("Microsoft") != -1)
2341 {
2342 return true;
2343 }
2344 }
2345 }
2346
2347 return false;
2348 }
2349
2350 /**
2351 * Where appropriate move object elements from head to body.
2352 * @param lexer Lexer
2353 * @param html html node
2354 */
2355 static void bumpObject(Lexer lexer, Node html)
2356 {
2357 if (html == null)
2358 {
2359 return;
2360 }
2361
2362 Node node, next, head = null, body = null;
2363 TagTable tt = lexer.configuration.tt;
2364 for (node = html.content; node != null; node = node.next)
2365 {
2366 if (node.tag == tt.tagHead)
2367 {
2368 head = node;
2369 }
2370
2371 if (node.tag == tt.tagBody)
2372 {
2373 body = node;
2374 }
2375 }
2376
2377 if (head != null && body != null)
2378 {
2379 for (node = head.content; node != null; node = next)
2380 {
2381 next = node.next;
2382
2383 if (node.tag == tt.tagObject)
2384 {
2385 Node child;
2386 boolean bump = false;
2387
2388 for (child = node.content; child != null; child = child.next)
2389 {
2390
2391 if ((child.type == Node.TEXT_NODE && !node.isBlank(lexer)) || child.tag != tt.tagParam)
2392 {
2393 bump = true;
2394 break;
2395 }
2396 }
2397
2398 if (bump)
2399 {
2400 node.removeNode();
2401 body.insertNodeAtStart(node);
2402 }
2403 }
2404 }
2405 }
2406 }
2407
2408 }