1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54 package org.w3c.tidy;
55
/**
 * Clean up misuse of presentation markup. Filters from other formats such as Microsoft Word often make excessive use of
 * presentation markup such as font tags, B, I, and the align attribute. By applying a set of production rules, it is
 * straightforward to transform this to use CSS. Some rules replace some of the children of an element by style
 * properties on the element, e.g.
 *
 * <pre>
 * &lt;p&gt;&lt;b&gt;...&lt;/b&gt;&lt;/p&gt;   -&gt;   &lt;p style="font-weight: bold"&gt;...&lt;/p&gt;
 * </pre>
 *
 * Such rules are applied to the element's content and then to the element itself until none of the rules apply any
 * more. Having applied all the rules to an element, it will have a style attribute with one or more properties. Other
 * rules strip the element they apply to, replacing it by style properties on the contents, e.g.
 *
 * <pre>
 * &lt;dir&gt;&lt;li&gt;&lt;p&gt;...&lt;/li&gt;&lt;/dir&gt;   -&gt;   &lt;p style="margin-left: 1em"&gt;...
 * </pre>
 *
 * These rules are applied to an element before processing its content and replace the current element by the first
 * element in the exposed content. After applying both sets of rules, you can replace the style attribute by a class
 * value and style rule in the document head. To support this, an association of styles and class names is built. A
 * naive approach is to rely on string matching to test when two property lists are the same. A better approach would be
 * to first sort the properties before matching.
 * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
 * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
 * @author Fabrizio Giustina
 * @version $Revision: 1.25 $ ($Author: fgiust $)
 */
85 public class Clean
86 {
87
88 /***
89 * sequential number for generated css classes.
90 */
91 private int classNum = 1;
92
93 /***
94 * Tag table.
95 */
96 private TagTable tt;
97
98 /***
99 * Instantiates a new Clean.
100 * @param tagTable tag table instance
101 */
102 public Clean(TagTable tagTable)
103 {
104 this.tt = tagTable;
105 }
106
107 /***
108 * Insert a css style property.
109 * @param props StyleProp instance
110 * @param name property name
111 * @param value property value
112 * @return StyleProp containin the given property
113 */
114 private StyleProp insertProperty(StyleProp props, String name, String value)
115 {
116 StyleProp first, prev, prop;
117 int cmp;
118
119 prev = null;
120 first = props;
121
122 while (props != null)
123 {
124 cmp = props.name.compareTo(name);
125
126 if (cmp == 0)
127 {
128
129 return first;
130 }
131
132 if (cmp > 0)
133 {
134
135
136 prop = new StyleProp(name, value, props);
137
138 if (prev != null)
139 {
140 prev.next = prop;
141 }
142 else
143 {
144 first = prop;
145 }
146
147 return first;
148 }
149
150 prev = props;
151 props = props.next;
152 }
153
154 prop = new StyleProp(name, value, null);
155
156 if (prev != null)
157 {
158 prev.next = prop;
159 }
160 else
161 {
162 first = prop;
163 }
164
165 return first;
166 }
167
168 /***
169 * Create sorted linked list of properties from style string.
170 * @param prop StyleProp
171 * @param style style string
172 * @return StyleProp with given style
173 */
174 private StyleProp createProps(StyleProp prop, String style)
175 {
176 int nameEnd;
177 int valueEnd;
178 int valueStart = 0;
179 int nameStart = 0;
180 boolean more;
181
182 nameStart = 0;
183 while (nameStart < style.length())
184 {
185 while (nameStart < style.length() && style.charAt(nameStart) == ' ')
186 {
187 ++nameStart;
188 }
189
190 nameEnd = nameStart;
191
192 while (nameEnd < style.length())
193 {
194 if (style.charAt(nameEnd) == ':')
195 {
196 valueStart = nameEnd + 1;
197 break;
198 }
199
200 ++nameEnd;
201 }
202
203 if (nameEnd >= style.length() || style.charAt(nameEnd) != ':')
204 {
205 break;
206 }
207
208 while (valueStart < style.length() && style.charAt(valueStart) == ' ')
209 {
210 ++valueStart;
211 }
212
213 valueEnd = valueStart;
214 more = false;
215
216 while (valueEnd < style.length())
217 {
218 if (style.charAt(valueEnd) == ';')
219 {
220 more = true;
221 break;
222 }
223
224 ++valueEnd;
225 }
226
227 prop = insertProperty(prop, style.substring(nameStart, nameEnd), style.substring(valueStart, valueEnd));
228
229 if (more)
230 {
231 nameStart = valueEnd + 1;
232 continue;
233 }
234
235 break;
236 }
237
238 return prop;
239 }
240
241 /***
242 * Create a css property.
243 * @param props StyleProp
244 * @return css property as String
245 */
246 private String createPropString(StyleProp props)
247 {
248 String style = "";
249 int len;
250 StyleProp prop;
251
252
253 for (len = 0, prop = props; prop != null; prop = prop.next)
254 {
255 len += prop.name.length() + 2;
256 len += prop.value.length() + 2;
257 }
258
259 for (prop = props; prop != null; prop = prop.next)
260 {
261 style = style.concat(prop.name);
262 style = style.concat(": ");
263
264 style = style.concat(prop.value);
265
266 if (prop.next == null)
267 {
268 break;
269 }
270
271 style = style.concat("; ");
272 }
273
274 return style;
275 }
276
277 /***
278 * Creates a string with merged properties.
279 * @param style css style
280 * @param property css properties
281 * @return merged string
282 */
283 private String addProperty(String style, String property)
284 {
285 StyleProp prop;
286
287 prop = createProps(null, style);
288 prop = createProps(prop, property);
289 style = createPropString(prop);
290 return style;
291 }
292
293 /***
294 * Generates a new css class name.
295 * @param lexer Lexer
296 * @param tag Tag
297 * @return generated css class
298 */
299 private String gensymClass(Lexer lexer, String tag)
300 {
301 String str;
302
303 str = lexer.configuration.cssPrefix == null ? lexer.configuration.cssPrefix + this.classNum : "c"
304 + this.classNum;
305 this.classNum++;
306 return str;
307 }
308
309 /***
310 * Finds a css style.
311 * @param lexer Lexer
312 * @param tag tag name
313 * @param properties css properties
314 * @return style string
315 */
316 private String findStyle(Lexer lexer, String tag, String properties)
317 {
318 Style style;
319
320 for (style = lexer.styles; style != null; style = style.next)
321 {
322 if (style.tag.equals(tag) && style.properties.equals(properties))
323 {
324 return style.tagClass;
325 }
326 }
327
328 style = new Style(tag, gensymClass(lexer, tag), properties, lexer.styles);
329 lexer.styles = style;
330 return style.tagClass;
331 }
332
333 /***
334 * Find style attribute in node, and replace it by corresponding class attribute. Search for class in style
335 * dictionary otherwise gensym new class and add to dictionary. Assumes that node doesn't have a class attribute.
336 * @param lexer Lexer
337 * @param node node with a style attribute
338 */
339 private void style2Rule(Lexer lexer, Node node)
340 {
341 AttVal styleattr, classattr;
342 String classname;
343
344 styleattr = node.getAttrByName("style");
345
346 if (styleattr != null)
347 {
348 classname = findStyle(lexer, node.element, styleattr.value);
349 classattr = node.getAttrByName("class");
350
351
352
353 if (classattr != null)
354 {
355 classattr.value = classattr.value + " " + classname;
356 node.removeAttribute(styleattr);
357 }
358 else
359 {
360
361 styleattr.attribute = "class";
362 styleattr.value = classname;
363 }
364 }
365 }
366
367 /***
368 * Adds a css rule for color.
369 * @param lexer Lexer
370 * @param selector css selector
371 * @param color color value
372 */
373 private void addColorRule(Lexer lexer, String selector, String color)
374 {
375 if (color != null)
376 {
377 lexer.addStringLiteral(selector);
378 lexer.addStringLiteral(" { color: ");
379 lexer.addStringLiteral(color);
380 lexer.addStringLiteral(" }\n");
381 }
382 }
383
384 /***
385 * Move presentation attribs from body to style element.
386 *
387 * <pre>
388 * background="foo" . body { background-image: url(foo) }
389 * bgcolor="foo" . body { background-color: foo }
390 * text="foo" . body { color: foo }
391 * link="foo" . :link { color: foo }
392 * vlink="foo" . :visited { color: foo }
393 * alink="foo" . :active { color: foo }
394 * </pre>
395 *
396 * @param lexer Lexer
397 * @param body body node
398 */
399 private void cleanBodyAttrs(Lexer lexer, Node body)
400 {
401 AttVal attr;
402 String bgurl = null;
403 String bgcolor = null;
404 String color = null;
405
406 attr = body.getAttrByName("background");
407
408 if (attr != null)
409 {
410 bgurl = attr.value;
411 attr.value = null;
412 body.removeAttribute(attr);
413 }
414
415 attr = body.getAttrByName("bgcolor");
416
417 if (attr != null)
418 {
419 bgcolor = attr.value;
420 attr.value = null;
421 body.removeAttribute(attr);
422 }
423
424 attr = body.getAttrByName("text");
425
426 if (attr != null)
427 {
428 color = attr.value;
429 attr.value = null;
430 body.removeAttribute(attr);
431 }
432
433 if (bgurl != null || bgcolor != null || color != null)
434 {
435 lexer.addStringLiteral(" body {\n");
436
437 if (bgurl != null)
438 {
439 lexer.addStringLiteral(" background-image: url(");
440 lexer.addStringLiteral(bgurl);
441 lexer.addStringLiteral(");\n");
442 }
443
444 if (bgcolor != null)
445 {
446 lexer.addStringLiteral(" background-color: ");
447 lexer.addStringLiteral(bgcolor);
448 lexer.addStringLiteral(";\n");
449 }
450
451 if (color != null)
452 {
453 lexer.addStringLiteral(" color: ");
454 lexer.addStringLiteral(color);
455 lexer.addStringLiteral(";\n");
456 }
457
458 lexer.addStringLiteral(" }\n");
459 }
460
461 attr = body.getAttrByName("link");
462
463 if (attr != null)
464 {
465 addColorRule(lexer, " :link", attr.value);
466 body.removeAttribute(attr);
467 }
468
469 attr = body.getAttrByName("vlink");
470
471 if (attr != null)
472 {
473 addColorRule(lexer, " :visited", attr.value);
474 body.removeAttribute(attr);
475 }
476
477 attr = body.getAttrByName("alink");
478
479 if (attr != null)
480 {
481 addColorRule(lexer, " :active", attr.value);
482 body.removeAttribute(attr);
483 }
484 }
485
486 /***
487 * Check deprecated attributes in body tag.
488 * @param lexer Lexer
489 * @param doc document root node
490 * @return <code>true</code> is the body doesn't contain deprecated attributes, false otherwise.
491 */
492 private boolean niceBody(Lexer lexer, Node doc)
493 {
494 Node body = doc.findBody(lexer.configuration.tt);
495
496 if (body != null)
497 {
498 if (body.getAttrByName("background") != null
499 || body.getAttrByName("bgcolor") != null
500 || body.getAttrByName("text") != null
501 || body.getAttrByName("link") != null
502 || body.getAttrByName("vlink") != null
503 || body.getAttrByName("alink") != null)
504 {
505 lexer.badLayout |= Report.USING_BODY;
506 return false;
507 }
508 }
509
510 return true;
511 }
512
513 /***
514 * Create style element using rules from dictionary.
515 * @param lexer Lexer
516 * @param doc root node
517 */
518 private void createStyleElement(Lexer lexer, Node doc)
519 {
520 Node node, head, body;
521 Style style;
522 AttVal av;
523
524 if (lexer.styles == null && niceBody(lexer, doc))
525 {
526 return;
527 }
528
529 node = lexer.newNode(Node.START_TAG, null, 0, 0, "style");
530 node.implicit = true;
531
532
533 av = new AttVal(null, null, '"', "type", "text/css");
534 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
535 node.attributes = av;
536
537 body = doc.findBody(lexer.configuration.tt);
538
539 lexer.txtstart = lexer.lexsize;
540
541 if (body != null)
542 {
543 cleanBodyAttrs(lexer, body);
544 }
545
546 for (style = lexer.styles; style != null; style = style.next)
547 {
548 lexer.addCharToLexer(' ');
549 lexer.addStringLiteral(style.tag);
550 lexer.addCharToLexer('.');
551 lexer.addStringLiteral(style.tagClass);
552 lexer.addCharToLexer(' ');
553 lexer.addCharToLexer('{');
554 lexer.addStringLiteral(style.properties);
555 lexer.addCharToLexer('}');
556 lexer.addCharToLexer('\n');
557 }
558
559 lexer.txtend = lexer.lexsize;
560
561 node.insertNodeAtEnd(lexer.newNode(Node.TEXT_NODE, lexer.lexbuf, lexer.txtstart, lexer.txtend));
562
563
564
565
566 head = doc.findHEAD(lexer.configuration.tt);
567
568 if (head != null)
569 {
570 head.insertNodeAtEnd(node);
571 }
572 }
573
574 /***
575 * Ensure bidirectional links are consistent.
576 * @param node root node
577 */
578 private void fixNodeLinks(Node node)
579 {
580 Node child;
581
582 if (node.prev != null)
583 {
584 node.prev.next = node;
585 }
586 else
587 {
588 node.parent.content = node;
589 }
590
591 if (node.next != null)
592 {
593 node.next.prev = node;
594 }
595 else
596 {
597 node.parent.last = node;
598 }
599
600 for (child = node.content; child != null; child = child.next)
601 {
602 child.parent = node;
603 }
604 }
605
606 /***
607 * Used to strip child of node when the node has one and only one child.
608 * @param node parent node
609 */
610 private void stripOnlyChild(Node node)
611 {
612 Node child;
613
614 child = node.content;
615 node.content = child.content;
616 node.last = child.last;
617 child.content = null;
618
619 for (child = node.content; child != null; child = child.next)
620 {
621 child.parent = node;
622 }
623 }
624
625 /***
626 * Used to strip font start and end tags.
627 * @param element original node
628 * @param pnode passed in as array to allow modification. pnode[0] will contain the final node
629 * @todo remove the pnode parameter and make it a return value
630 */
631 private void discardContainer(Node element, Node[] pnode)
632 {
633 Node node;
634 Node parent = element.parent;
635
636 if (element.content != null)
637 {
638 element.last.next = element.next;
639
640 if (element.next != null)
641 {
642 element.next.prev = element.last;
643 element.last.next = element.next;
644 }
645 else
646 {
647 parent.last = element.last;
648 }
649
650 if (element.prev != null)
651 {
652 element.content.prev = element.prev;
653 element.prev.next = element.content;
654 }
655 else
656 {
657 parent.content = element.content;
658 }
659
660 for (node = element.content; node != null; node = node.next)
661 {
662 node.parent = parent;
663 }
664
665 pnode[0] = element.content;
666 }
667 else
668 {
669 if (element.next != null)
670 {
671 element.next.prev = element.prev;
672 }
673 else
674 {
675 parent.last = element.prev;
676 }
677
678 if (element.prev != null)
679 {
680 element.prev.next = element.next;
681 }
682 else
683 {
684 parent.content = element.next;
685 }
686
687 pnode[0] = element.next;
688 }
689
690 element.next = null;
691 element.content = null;
692 }
693
694 /***
695 * Add style property to element, creating style attribute as needed and adding ; delimiter.
696 * @param node node
697 * @param property property added to node
698 */
699 private void addStyleProperty(Node node, String property)
700 {
701 AttVal av;
702
703 for (av = node.attributes; av != null; av = av.next)
704 {
705 if (av.attribute.equals("style"))
706 {
707 break;
708 }
709 }
710
711
712
713 if (av != null)
714 {
715 String s;
716
717 s = addProperty(av.value, property);
718 av.value = s;
719 }
720 else
721 {
722
723 av = new AttVal(node.attributes, null, '"', "style", property);
724 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
725 node.attributes = av;
726 }
727 }
728
729 /***
730 * Create new string that consists of the combined style properties in s1 and s2. To merge property lists, we build
731 * a linked list of property/values and insert properties into the list in order, merging values for the same
732 * property name.
733 * @param s1 first property
734 * @param s2 second property
735 * @return merged properties
736 */
737 private String mergeProperties(String s1, String s2)
738 {
739 String s;
740 StyleProp prop;
741
742 prop = createProps(null, s1);
743 prop = createProps(prop, s2);
744 s = createPropString(prop);
745 return s;
746 }
747
748 /***
749 * Merge class attributes from 2 nodes.
750 * @param node Node
751 * @param child Child node
752 */
753 private void mergeClasses(Node node, Node child)
754 {
755 AttVal av;
756 String s1, s2, names;
757
758 for (s2 = null, av = child.attributes; av != null; av = av.next)
759 {
760 if ("class".equals(av.attribute))
761 {
762 s2 = av.value;
763 break;
764 }
765 }
766
767 for (s1 = null, av = node.attributes; av != null; av = av.next)
768 {
769 if ("class".equals(av.attribute))
770 {
771 s1 = av.value;
772 break;
773 }
774 }
775
776 if (s1 != null)
777 {
778 if (s2 != null)
779 {
780 names = s1 + ' ' + s2;
781 av.value = names;
782 }
783 }
784 else if (s2 != null)
785 {
786 av = new AttVal(node.attributes, null, '"', "class", s2);
787 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
788 node.attributes = av;
789 }
790 }
791
792 /***
793 * Merge style from 2 nodes.
794 * @param node Node
795 * @param child Child node
796 */
797 private void mergeStyles(Node node, Node child)
798 {
799 AttVal av;
800 String s1, s2, style;
801
802
803
804 mergeClasses(node, child);
805
806 for (s2 = null, av = child.attributes; av != null; av = av.next)
807 {
808 if (av.attribute.equals("style"))
809 {
810 s2 = av.value;
811 break;
812 }
813 }
814
815 for (s1 = null, av = node.attributes; av != null; av = av.next)
816 {
817 if (av.attribute.equals("style"))
818 {
819 s1 = av.value;
820 break;
821 }
822 }
823
824 if (s1 != null)
825 {
826 if (s2 != null)
827 {
828 style = mergeProperties(s1, s2);
829 av.value = style;
830 }
831 }
832 else if (s2 != null)
833 {
834 av = new AttVal(node.attributes, null, '"', "style", s2);
835 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
836 node.attributes = av;
837 }
838 }
839
840 /***
841 * Map a % font size to a named font size.
842 * @param size size in %
843 * @return font size name
844 */
845 private String fontSize2Name(String size)
846 {
847 String[] sizes = {"60%", "70%", "80%", null, "120%", "150%", "200%"};
848 String buf;
849
850 if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6')
851 {
852 int n = size.charAt(0) - '0';
853 return sizes[n];
854 }
855
856 if (size.length() > 0 && size.charAt(0) == '-')
857 {
858 if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6')
859 {
860 int n = size.charAt(1) - '0';
861 double x;
862
863 for (x = 1.0; n > 0; --n)
864 {
865 x *= 0.8;
866 }
867
868 x *= 100.0;
869 buf = "" + (int) x + "%";
870
871 return buf;
872 }
873
874 return "smaller";
875 }
876
877 if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6')
878 {
879 int n = size.charAt(1) - '0';
880 double x;
881
882 for (x = 1.0; n > 0; --n)
883 {
884 x *= 1.2;
885 }
886
887 x *= 100.0;
888 buf = "" + (int) x + "%";
889
890 return buf;
891 }
892
893 return "larger";
894 }
895
896 /***
897 * Adds a font-family style.
898 * @param node Node
899 * @param face font face
900 */
901 private void addFontFace(Node node, String face)
902 {
903 addStyleProperty(node, "font-family: " + face);
904 }
905
906 /***
907 * Adds a font size style.
908 * @param node Node
909 * @param size font size
910 */
911 private void addFontSize(Node node, String size)
912 {
913 String value;
914
915 if (size.equals("6") && node.tag == this.tt.tagP)
916 {
917 node.element = "h1";
918 this.tt.findTag(node);
919 return;
920 }
921
922 if (size.equals("5") && node.tag == this.tt.tagP)
923 {
924 node.element = "h2";
925 this.tt.findTag(node);
926 return;
927 }
928
929 if (size.equals("4") && node.tag == this.tt.tagP)
930 {
931 node.element = "h3";
932 this.tt.findTag(node);
933 return;
934 }
935
936 value = fontSize2Name(size);
937
938 if (value != null)
939 {
940 addStyleProperty(node, "font-size: " + value);
941 }
942 }
943
944 /***
945 * Adds a font color style.
946 * @param node Node
947 * @param color color value
948 */
949 private void addFontColor(Node node, String color)
950 {
951 addStyleProperty(node, "color: " + color);
952 }
953
954 /***
955 * Adds an align style.
956 * @param node Node
957 * @param align align value
958 */
959 private void addAlign(Node node, String align)
960 {
961
962 addStyleProperty(node, "text-align: " + align.toLowerCase());
963 }
964
965 /***
966 * Add style properties to node corresponding to the font face, size and color attributes.
967 * @param node font tag
968 * @param av attribute list for node
969 */
970 private void addFontStyles(Node node, AttVal av)
971 {
972 while (av != null)
973 {
974 if (av.attribute.equals("face"))
975 {
976 addFontFace(node, av.value);
977 }
978 else if (av.attribute.equals("size"))
979 {
980 addFontSize(node, av.value);
981 }
982 else if (av.attribute.equals("color"))
983 {
984 addFontColor(node, av.value);
985 }
986
987 av = av.next;
988 }
989 }
990
991 /***
992 * Symptom: <code><p align=center></code>. Action: <code><p style="text-align: center"></code>.
993 * @param lexer Lexer
994 * @param node node with center attribute. Will be modified to use css style.
995 */
996 private void textAlign(Lexer lexer, Node node)
997 {
998 AttVal av, prev;
999
1000 prev = null;
1001
1002 for (av = node.attributes; av != null; av = av.next)
1003 {
1004 if (av.attribute.equals("align"))
1005 {
1006 if (prev != null)
1007 {
1008 prev.next = av.next;
1009 }
1010 else
1011 {
1012 node.attributes = av.next;
1013 }
1014
1015 if (av.value != null)
1016 {
1017 addAlign(node, av.value);
1018 }
1019
1020 break;
1021 }
1022
1023 prev = av;
1024 }
1025 }
1026
1027 /***
1028 * Symptom: <code><dir><li></code> where <code><li></code> is only child. Action: coerce
1029 * <code><dir> <li></code> to <code><div></code> with indent. The clean up rules use the pnode argument
1030 * to return the next node when the original node has been deleted.
1031 * @param lexer Lexer
1032 * @param node dir tag
1033 * @return <code>true</code> if a dir tag has been coerced to a div
1034 */
1035 private boolean dir2Div(Lexer lexer, Node node)
1036 {
1037 Node child;
1038
1039 if (node.tag == this.tt.tagDir || node.tag == this.tt.tagUl || node.tag == this.tt.tagOl)
1040 {
1041 child = node.content;
1042
1043 if (child == null)
1044 {
1045 return false;
1046 }
1047
1048
1049 if (child.next != null)
1050 {
1051 return false;
1052 }
1053
1054 if (child.tag != this.tt.tagLi)
1055 {
1056 return false;
1057 }
1058
1059 if (!child.implicit)
1060 {
1061 return false;
1062 }
1063
1064
1065 node.tag = this.tt.tagDiv;
1066 node.element = "div";
1067 addStyleProperty(node, "margin-left: 2em");
1068 stripOnlyChild(node);
1069 return true;
1070 }
1071
1072 return false;
1073 }
1074
1075 /***
1076 * Symptom:
1077 *
1078 * <pre>
1079 * <center>
1080 * </pre>.
1081 * <p>
1082 * Action: replace <code><center></code> by <code><div style="text-align: center"></code>
1083 * </p>
1084 * @param lexer Lexer
1085 * @param node center tag
1086 * @param pnode pnode[0] is the same as node, passed in as an array to allow modification
1087 * @return <code>true</code> if a center tag has been replaced by a div
1088 */
1089 private boolean center2Div(Lexer lexer, Node node, Node[] pnode)
1090 {
1091 if (node.tag == this.tt.tagCenter)
1092 {
1093 if (lexer.configuration.dropFontTags)
1094 {
1095 if (node.content != null)
1096 {
1097 Node last = node.last;
1098 Node parent = node.parent;
1099
1100 discardContainer(node, pnode);
1101
1102 node = lexer.inferredTag("br");
1103
1104 if (last.next != null)
1105 {
1106 last.next.prev = node;
1107 }
1108
1109 node.next = last.next;
1110 last.next = node;
1111 node.prev = last;
1112
1113 if (parent.last == last)
1114 {
1115 parent.last = node;
1116 }
1117
1118 node.parent = parent;
1119 }
1120 else
1121 {
1122 Node prev = node.prev;
1123 Node next = node.next;
1124 Node parent = node.parent;
1125 discardContainer(node, pnode);
1126
1127 node = lexer.inferredTag("br");
1128 node.next = next;
1129 node.prev = prev;
1130 node.parent = parent;
1131
1132 if (next != null)
1133 {
1134 next.prev = node;
1135 }
1136 else
1137 {
1138 parent.last = node;
1139 }
1140
1141 if (prev != null)
1142 {
1143 prev.next = node;
1144 }
1145 else
1146 {
1147 parent.content = node;
1148 }
1149 }
1150
1151 return true;
1152 }
1153 node.tag = this.tt.tagDiv;
1154 node.element = "div";
1155 addStyleProperty(node, "text-align: center");
1156 return true;
1157 }
1158
1159 return false;
1160 }
1161
1162 /***
1163 * Symptom: <code><div><div>...</div></div></code> Action: merge the two divs. This is useful after
1164 * nested <dir>s used by Word for indenting have been converted to <div>s.
1165 * @param lexer Lexer
1166 * @param node first div
1167 * @return true if the divs have been merged
1168 */
1169 private boolean mergeDivs(Lexer lexer, Node node)
1170 {
1171 Node child;
1172
1173 if (node.tag != this.tt.tagDiv)
1174 {
1175 return false;
1176 }
1177
1178 child = node.content;
1179
1180 if (child == null)
1181 {
1182 return false;
1183 }
1184
1185 if (child.tag != this.tt.tagDiv)
1186 {
1187 return false;
1188 }
1189
1190 if (child.next != null)
1191 {
1192 return false;
1193 }
1194
1195 mergeStyles(node, child);
1196 stripOnlyChild(node);
1197 return true;
1198 }
1199
1200 /***
1201 * Symptom:
1202 * <ul>
1203 * <li>
1204 * <ul>
1205 * ...
1206 * </ul>
1207 * </li>
1208 * </ul>
1209 * Action: discard outer list.
1210 * @param lexer Lexer
1211 * @param node Node
1212 * @param pnode passed in as array to allow modifications.
1213 * @return <code>true</code> if nested lists have been found and replaced
1214 */
1215 private boolean nestedList(Lexer lexer, Node node, Node[] pnode)
1216 {
1217 Node child, list;
1218
1219 if (node.tag == this.tt.tagUl || node.tag == this.tt.tagOl)
1220 {
1221 child = node.content;
1222
1223 if (child == null)
1224 {
1225 return false;
1226 }
1227
1228
1229
1230 if (child.next != null)
1231 {
1232 return false;
1233 }
1234
1235 list = child.content;
1236
1237 if (list == null)
1238 {
1239 return false;
1240 }
1241
1242 if (list.tag != node.tag)
1243 {
1244 return false;
1245 }
1246
1247 pnode[0] = list;
1248
1249
1250 list.prev = node.prev;
1251 list.next = node.next;
1252 list.parent = node.parent;
1253 fixNodeLinks(list);
1254
1255
1256
1257 child.content = null;
1258 node.content = null;
1259 node.next = null;
1260 node = null;
1261
1262
1263
1264 if (list.prev != null)
1265 {
1266 if (list.prev.tag == this.tt.tagUl || list.prev.tag == this.tt.tagOl)
1267 {
1268
1269 node = list;
1270 list = node.prev;
1271
1272 list.next = node.next;
1273
1274 if (list.next != null)
1275 {
1276 list.next.prev = list;
1277 }
1278
1279 child = list.last;
1280
1281 node.parent = child;
1282 node.next = null;
1283 node.prev = child.last;
1284 fixNodeLinks(node);
1285 cleanNode(lexer, node);
1286 }
1287 }
1288
1289 return true;
1290 }
1291
1292 return false;
1293 }
1294
1295 /***
1296 * Symptom: the only child of a block-level element is a presentation element such as B, I or FONT. Action: add
1297 * style "font-weight: bold" to the block and strip the <b>element, leaving its children. example:
1298 *
1299 * <pre>
1300 * <p>
1301 * <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1302 * </p>
1303 * </pre>
1304 *
1305 * becomes:
1306 *
1307 * <pre>
1308 * <p style="font-weight: bold; font-family: Arial; font-size: 6">
1309 * Draft Recommended Practice
1310 * </p>
1311 * </pre>
1312 *
1313 * <p>
1314 * This code also replaces the align attribute by a style attribute. However, to avoid CSS problems with Navigator
1315 * 4, this isn't done for the elements: caption, tr and table
1316 * </p>
1317 * @param lexer Lexer
1318 * @param node parent node
1319 * @return <code>true</code> if the child node has been removed
1320 */
1321 private boolean blockStyle(Lexer lexer, Node node)
1322 {
1323 Node child;
1324
1325 if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0)
1326 {
1327 if (node.tag != this.tt.tagTable && node.tag != this.tt.tagTr && node.tag != this.tt.tagLi)
1328 {
1329
1330 if (node.tag != this.tt.tagCaption)
1331 {
1332 textAlign(lexer, node);
1333 }
1334
1335 child = node.content;
1336
1337 if (child == null)
1338 {
1339 return false;
1340 }
1341
1342
1343 if (child.next != null)
1344 {
1345 return false;
1346 }
1347
1348 if (child.tag == this.tt.tagB)
1349 {
1350 mergeStyles(node, child);
1351 addStyleProperty(node, "font-weight: bold");
1352 stripOnlyChild(node);
1353 return true;
1354 }
1355
1356 if (child.tag == this.tt.tagI)
1357 {
1358 mergeStyles(node, child);
1359 addStyleProperty(node, "font-style: italic");
1360 stripOnlyChild(node);
1361 return true;
1362 }
1363
1364 if (child.tag == this.tt.tagFont)
1365 {
1366 mergeStyles(node, child);
1367 addFontStyles(node, child.attributes);
1368 stripOnlyChild(node);
1369 return true;
1370 }
1371 }
1372 }
1373
1374 return false;
1375 }
1376
1377 /***
1378 * If the node has only one b, i, or font child remove the child node and add the appropriate style attributes to
1379 * parent.
1380 * @param lexer Lexer
1381 * @param node parent node
1382 * @param pnode passed as an array to allow modifications
1383 * @return <code>true</code> if child node has been stripped, replaced by style attributes.
1384 */
1385 private boolean inlineStyle(Lexer lexer, Node node, Node[] pnode)
1386 {
1387 Node child;
1388
1389 if (node.tag != this.tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0)
1390 {
1391 child = node.content;
1392
1393 if (child == null)
1394 {
1395 return false;
1396 }
1397
1398
1399 if (child.next != null)
1400 {
1401 return false;
1402 }
1403
1404 if (child.tag == this.tt.tagB && lexer.configuration.logicalEmphasis)
1405 {
1406 mergeStyles(node, child);
1407 addStyleProperty(node, "font-weight: bold");
1408 stripOnlyChild(node);
1409 return true;
1410 }
1411
1412 if (child.tag == this.tt.tagI && lexer.configuration.logicalEmphasis)
1413 {
1414 mergeStyles(node, child);
1415 addStyleProperty(node, "font-style: italic");
1416 stripOnlyChild(node);
1417 return true;
1418 }
1419
1420 if (child.tag == this.tt.tagFont)
1421 {
1422 mergeStyles(node, child);
1423 addFontStyles(node, child.attributes);
1424 stripOnlyChild(node);
1425 return true;
1426 }
1427 }
1428
1429 return false;
1430 }
1431
1432 /***
1433 * Replace font elements by span elements, deleting the font element's attributes and replacing them by a single
1434 * style attribute.
1435 * @param lexer Lexer
1436 * @param node font tag
1437 * @param pnode passed as an array to allow modifications
1438 * @return <code>true</code> if a font tag has been dropped and replaced by style attributes
1439 */
1440 private boolean font2Span(Lexer lexer, Node node, Node[] pnode)
1441 {
1442 AttVal av, style, next;
1443
1444 if (node.tag == this.tt.tagFont)
1445 {
1446 if (lexer.configuration.dropFontTags)
1447 {
1448 discardContainer(node, pnode);
1449 return false;
1450 }
1451
1452
1453 if (node.parent.content == node && node.next == null)
1454 {
1455 return false;
1456 }
1457
1458 addFontStyles(node, node.attributes);
1459
1460
1461 av = node.attributes;
1462 style = null;
1463
1464 while (av != null)
1465 {
1466 next = av.next;
1467
1468 if (av.attribute.equals("style"))
1469 {
1470 av.next = null;
1471 style = av;
1472 }
1473
1474 av = next;
1475 }
1476
1477 node.attributes = style;
1478
1479 node.tag = this.tt.tagSpan;
1480 node.element = "span";
1481
1482 return true;
1483 }
1484
1485 return false;
1486 }
1487
1488 /***
1489 * Applies all matching rules to a node.
1490 * @param lexer Lexer
1491 * @param node original node
1492 * @return cleaned up node
1493 */
1494 private Node cleanNode(Lexer lexer, Node node)
1495 {
1496 Node next = null;
1497 Node[] o = new Node[1];
1498 boolean b = false;
1499
1500 for (next = node; node != null && node.isElement(); node = next)
1501 {
1502 o[0] = next;
1503
1504 b = dir2Div(lexer, node);
1505 next = o[0];
1506 if (b)
1507 {
1508 continue;
1509 }
1510
1511
1512
1513 b = nestedList(lexer, node, o);
1514 next = o[0];
1515 if (b)
1516 {
1517 return next;
1518 }
1519
1520 b = center2Div(lexer, node, o);
1521 next = o[0];
1522 if (b)
1523 {
1524 continue;
1525 }
1526
1527 b = mergeDivs(lexer, node);
1528 next = o[0];
1529 if (b)
1530 {
1531 continue;
1532 }
1533
1534 b = blockStyle(lexer, node);
1535 next = o[0];
1536 if (b)
1537 {
1538 continue;
1539 }
1540
1541 b = inlineStyle(lexer, node, o);
1542 next = o[0];
1543 if (b)
1544 {
1545 continue;
1546 }
1547
1548 b = font2Span(lexer, node, o);
1549 next = o[0];
1550 if (b)
1551 {
1552 continue;
1553 }
1554
1555 break;
1556 }
1557
1558 return next;
1559 }
1560
1561 /***
1562 * Special case: if the current node is destroyed by CleanNode() lower in the tree, this node and its parent no
1563 * longer exist. So we must jump back up the CreateStyleProperties() call stack until we have a valid node
1564 * reference.
1565 * @param lexer Lexer
1566 * @param node Node
1567 * @param prepl passed in as array to allow modifications
1568 * @return cleaned Node
1569 */
1570 private Node createStyleProperties(Lexer lexer, Node node, Node[] prepl)
1571 {
1572 Node child;
1573
1574 if (node.content != null)
1575 {
1576 Node[] repl = new Node[1];
1577 repl[0] = node;
1578 for (child = node.content; child != null; child = child.next)
1579 {
1580 child = createStyleProperties(lexer, child, repl);
1581 if (repl[0] != node)
1582 {
1583 return repl[0];
1584 }
1585 }
1586 }
1587
1588 return cleanNode(lexer, node);
1589 }
1590
1591 /***
1592 * Find style attribute in node content, and replace it by corresponding class attribute.
1593 * @param lexer Lexer
1594 * @param node parent node
1595 */
1596 private void defineStyleRules(Lexer lexer, Node node)
1597 {
1598 Node child;
1599
1600 if (node.content != null)
1601 {
1602 child = node.content;
1603 while (child != null)
1604 {
1605 defineStyleRules(lexer, child);
1606 child = child.next;
1607 }
1608 }
1609
1610 style2Rule(lexer, node);
1611 }
1612
1613 /***
1614 * Clean an html tree.
1615 * @param lexer Lexer
1616 * @param doc root node
1617 */
1618 public void cleanTree(Lexer lexer, Node doc)
1619 {
1620 Node[] repl = new Node[1];
1621 repl[0] = doc;
1622 doc = createStyleProperties(lexer, doc, repl);
1623
1624 if (!lexer.configuration.makeClean)
1625 {
1626 defineStyleRules(lexer, doc);
1627 createStyleElement(lexer, doc);
1628 }
1629 }
1630
1631 /***
1632 * simplifies <b><b>... </b> ... </b> etc.
1633 * @param node root Node
1634 */
1635 public void nestedEmphasis(Node node)
1636 {
1637 Node[] o = new Node[1];
1638 Node next;
1639
1640 while (node != null)
1641 {
1642 next = node.next;
1643
1644 if ((node.tag == this.tt.tagB || node.tag == this.tt.tagI)
1645 && node.parent != null
1646 && node.parent.tag == node.tag)
1647 {
1648
1649 o[0] = next;
1650 discardContainer(node, o);
1651 next = o[0];
1652 node = next;
1653 continue;
1654 }
1655
1656 if (node.content != null)
1657 {
1658 nestedEmphasis(node.content);
1659 }
1660
1661 node = next;
1662 }
1663 }
1664
1665 /***
1666 * Replace i by em and b by strong.
1667 * @param node root Node
1668 */
1669 public void emFromI(Node node)
1670 {
1671 while (node != null)
1672 {
1673 if (node.tag == this.tt.tagI)
1674 {
1675 node.element = this.tt.tagEm.name;
1676 node.tag = this.tt.tagEm;
1677 }
1678 else if (node.tag == this.tt.tagB)
1679 {
1680 node.element = this.tt.tagStrong.name;
1681 node.tag = this.tt.tagStrong;
1682 }
1683
1684 if (node.content != null)
1685 {
1686 emFromI(node.content);
1687 }
1688
1689 node = node.next;
1690 }
1691 }
1692
1693 /***
1694 * Some people use dir or ul without an li to indent the content. The pattern to look for is a list with a single
1695 * implicit li. This is recursively replaced by an implicit blockquote.
1696 * @param node root Node
1697 */
1698 public void list2BQ(Node node)
1699 {
1700 while (node != null)
1701 {
1702 if (node.content != null)
1703 {
1704 list2BQ(node.content);
1705 }
1706
1707 if (node.tag != null
1708 && node.tag.getParser() == ParserImpl.LIST
1709 && node.hasOneChild()
1710 && node.content.implicit)
1711 {
1712 stripOnlyChild(node);
1713 node.element = this.tt.tagBlockquote.name;
1714 node.tag = this.tt.tagBlockquote;
1715 node.implicit = true;
1716 }
1717
1718 node = node.next;
1719 }
1720 }
1721
1722 /***
1723 * Replace implicit blockquote by div with an indent taking care to reduce nested blockquotes to a single div with
1724 * the indent set to match the nesting depth.
1725 * @param node root Node
1726 */
1727 public void bQ2Div(Node node)
1728 {
1729 int indent;
1730 String indentBuf;
1731 AttVal attval;
1732
1733 while (node != null)
1734 {
1735 if (node.tag == this.tt.tagBlockquote && node.implicit)
1736 {
1737 indent = 1;
1738
1739 while (node.hasOneChild() && node.content.tag == this.tt.tagBlockquote && node.implicit)
1740 {
1741 ++indent;
1742 stripOnlyChild(node);
1743 }
1744
1745 if (node.content != null)
1746 {
1747 bQ2Div(node.content);
1748 }
1749
1750 indentBuf = "margin-left: " + (new Integer(2 * indent)).toString() + "em";
1751
1752 node.element = this.tt.tagDiv.name;
1753 node.tag = this.tt.tagDiv;
1754
1755 attval = node.getAttrByName("style");
1756
1757 if (attval != null && attval.value != null)
1758 {
1759 attval.value = indentBuf + "; " + attval.value;
1760 }
1761 else
1762 {
1763 node.addAttribute("style", indentBuf);
1764 }
1765 }
1766 else if (node.content != null)
1767 {
1768 bQ2Div(node.content);
1769 }
1770
1771 node = node.next;
1772 }
1773 }
1774
1775 /***
1776 * Find the enclosing table cell for the given node.
1777 * @param node Node
1778 * @return enclosing cell node
1779 */
1780 Node findEnclosingCell(Node node)
1781 {
1782 Node check;
1783
1784 for (check = node; check != null; check = check.parent)
1785 {
1786 if (check.tag == tt.tagTd)
1787 {
1788 return check;
1789 }
1790 }
1791 return null;
1792 }
1793
1794 /***
1795 * node is <code><![if ...]></code> prune up to <code><![endif]></code>.
1796 * @param lexer Lexer
1797 * @param node Node
1798 * @return cleaned up Node
1799 */
1800 public Node pruneSection(Lexer lexer, Node node)
1801 {
1802 for (;;)
1803 {
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820 node = Node.discardElement(node);
1821
1822 if (node == null)
1823 {
1824 return null;
1825 }
1826
1827 if (node.type == Node.SECTION_TAG)
1828 {
1829 if ((TidyUtils.getString(node.textarray, node.start, 2)).equals("if"))
1830 {
1831 node = pruneSection(lexer, node);
1832 continue;
1833 }
1834
1835 if ((TidyUtils.getString(node.textarray, node.start, 5)).equals("endif"))
1836 {
1837 node = Node.discardElement(node);
1838 break;
1839 }
1840 }
1841 }
1842
1843 return node;
1844 }
1845
1846 /***
1847 * Drop if/endif sections inserted by word2000.
1848 * @param lexer Lexer
1849 * @param node Node root node
1850 */
1851 public void dropSections(Lexer lexer, Node node)
1852 {
1853 while (node != null)
1854 {
1855 if (node.type == Node.SECTION_TAG)
1856 {
1857
1858 if ((TidyUtils.getString(node.textarray, node.start, 2)).equals("if")
1859 && (!(TidyUtils.getString(node.textarray, node.start, 7)).equals("if !vml")))
1860
1861 {
1862 node = pruneSection(lexer, node);
1863 continue;
1864 }
1865
1866
1867 node = Node.discardElement(node);
1868 continue;
1869 }
1870
1871 if (node.content != null)
1872 {
1873 dropSections(lexer, node.content);
1874 }
1875
1876 node = node.next;
1877 }
1878 }
1879
1880 /***
1881 * Remove word2000 attributes from node.
1882 * @param node node to cleanup
1883 */
1884 public void purgeWord2000Attributes(Node node)
1885 {
1886 AttVal attr = null;
1887 AttVal next = null;
1888 AttVal prev = null;
1889
1890 for (attr = node.attributes; attr != null; attr = next)
1891 {
1892 next = attr.next;
1893
1894
1895
1896 if (attr.attribute != null && attr.value != null && attr.attribute.equals("class"))
1897 {
1898 if (attr.value.equals("Code") || !attr.value.startsWith("Mso"))
1899 {
1900 prev = attr;
1901 continue;
1902 }
1903 }
1904
1905 if (attr.attribute != null
1906 && (attr.attribute.equals("class")
1907 || attr.attribute.equals("style")
1908 || attr.attribute.equals("lang")
1909 || attr.attribute.startsWith("x:") || ((attr.attribute.equals("height") || attr.attribute
1910 .equals("width")) &&
1911 (node.tag == this.tt.tagTd || node.tag == this.tt.tagTr || node.tag == this.tt.tagTh))))
1912 {
1913 if (prev != null)
1914 {
1915 prev.next = next;
1916 }
1917 else
1918 {
1919 node.attributes = next;
1920 }
1921
1922 }
1923 else
1924 {
1925 prev = attr;
1926 }
1927 }
1928 }
1929
1930 /***
1931 * Word2000 uses span excessively, so we strip span out.
1932 * @param lexer Lexer
1933 * @param span Node span
1934 * @return cleaned node
1935 */
1936 public Node stripSpan(Lexer lexer, Node span)
1937 {
1938 Node node;
1939 Node prev = null;
1940 Node content;
1941
1942
1943
1944
1945 cleanWord2000(lexer, span.content);
1946 content = span.content;
1947
1948 if (span.prev != null)
1949 {
1950 prev = span.prev;
1951 }
1952 else if (content != null)
1953 {
1954 node = content;
1955 content = content.next;
1956 node.removeNode();
1957 Node.insertNodeBeforeElement(span, node);
1958 prev = node;
1959 }
1960
1961 while (content != null)
1962 {
1963 node = content;
1964 content = content.next;
1965 node.removeNode();
1966 prev.insertNodeAfterElement(node);
1967 prev = node;
1968 }
1969
1970 if (span.next == null)
1971 {
1972 span.parent.last = prev;
1973 }
1974
1975 node = span.next;
1976 span.content = null;
1977 Node.discardElement(span);
1978 return node;
1979 }
1980
1981 /***
1982 * Map non-breaking spaces to regular spaces.
1983 * @param lexer Lexer
1984 * @param node Node
1985 */
1986 private void normalizeSpaces(Lexer lexer, Node node)
1987 {
1988 while (node != null)
1989 {
1990 if (node.content != null)
1991 {
1992 normalizeSpaces(lexer, node.content);
1993 }
1994
1995 if (node.type == Node.TEXT_NODE)
1996 {
1997 int i;
1998 int[] c = new int[1];
1999 int p = node.start;
2000
2001 for (i = node.start; i < node.end; ++i)
2002 {
2003 c[0] = node.textarray[i];
2004
2005
2006 if (c[0] > 0x7F)
2007 {
2008 i += PPrint.getUTF8(node.textarray, i, c);
2009 }
2010
2011 if (c[0] == 160)
2012 {
2013 c[0] = ' ';
2014 }
2015
2016 p = PPrint.putUTF8(node.textarray, p, c[0]);
2017 }
2018 }
2019
2020 node = node.next;
2021 }
2022 }
2023
2024 /***
2025 * Used to hunt for hidden preformatted sections.
2026 * @param node checked node
2027 * @return <code>true</code> if the node has a "margin-top: 0" or "margin-bottom: 0" style
2028 */
2029 boolean noMargins(Node node)
2030 {
2031 AttVal attval = node.getAttrByName("style");
2032
2033 if (attval == null || attval.value == null)
2034 {
2035 return false;
2036 }
2037
2038
2039 if (attval.value.indexOf("margin-top: 0") == -1)
2040 {
2041 return false;
2042 }
2043
2044
2045 if (attval.value.indexOf("margin-bottom: 0") == -1)
2046 {
2047 return false;
2048 }
2049
2050 return true;
2051 }
2052
2053 /***
2054 * Does element have a single space as its content?
2055 * @param lexer Lexer
2056 * @param node checked node
2057 * @return <code>true</code> if the element has a single space as its content
2058 */
2059 boolean singleSpace(Lexer lexer, Node node)
2060 {
2061 if (node.content != null)
2062 {
2063 node = node.content;
2064
2065 if (node.next != null)
2066 {
2067 return false;
2068 }
2069
2070 if (node.type != Node.TEXT_NODE)
2071 {
2072 return false;
2073 }
2074
2075 if (((node.end - node.start) == 1) && lexer.lexbuf[node.start] == ' ')
2076 {
2077 return true;
2078 }
2079
2080 if ((node.end - node.start) == 2)
2081 {
2082 int[] c = new int[1];
2083
2084 PPrint.getUTF8(lexer.lexbuf, node.start, c);
2085
2086 if (c[0] == 160)
2087 {
2088 return true;
2089 }
2090 }
2091 }
2092
2093 return false;
2094 }
2095
2096 /***
2097 * This is a major clean up to strip out all the extra stuff you get when you save as web page from Word 2000. It
2098 * doesn't yet know what to do with VML tags, but these will appear as errors unless you declare them as new tags,
2099 * such as o:p which needs to be declared as inline.
2100 * @param lexer Lexer
2101 * @param node node to clean up
2102 */
2103 public void cleanWord2000(Lexer lexer, Node node)
2104 {
2105
2106 Node list = null;
2107
2108 while (node != null)
2109 {
2110
2111
2112 if (node.tag == tt.tagHtml)
2113 {
2114
2115 if ((node.getAttrByName("xmlns:o") == null))
2116 {
2117 return;
2118 }
2119 lexer.configuration.tt.freeAttrs(node);
2120 }
2121
2122
2123 if (node.tag == tt.tagP)
2124 {
2125 if (noMargins(node))
2126 {
2127 Node pre;
2128 Node next;
2129 Node.coerceNode(lexer, node, tt.tagPre);
2130
2131 purgeWord2000Attributes(node);
2132
2133 if (node.content != null)
2134 {
2135 cleanWord2000(lexer, node.content);
2136 }
2137
2138 pre = node;
2139 node = node.next;
2140
2141
2142 while (node.tag == tt.tagP && noMargins(node))
2143 {
2144 next = node.next;
2145 node.removeNode();
2146 pre.insertNodeAtEnd(lexer.newLineNode());
2147 pre.insertNodeAtEnd(node);
2148 stripSpan(lexer, node);
2149 node = next;
2150 }
2151
2152 if (node == null)
2153 {
2154 break;
2155 }
2156 }
2157 }
2158
2159 if (node.tag != null && TidyUtils.toBoolean(node.tag.model & Dict.CM_BLOCK) && singleSpace(lexer, node))
2160 {
2161 node = stripSpan(lexer, node);
2162 continue;
2163 }
2164
2165
2166 if (node.tag == this.tt.tagStyle || node.tag == this.tt.tagMeta || node.type == Node.COMMENT_TAG)
2167 {
2168 node = Node.discardElement(node);
2169 continue;
2170 }
2171
2172
2173 if (node.tag == this.tt.tagSpan || node.tag == this.tt.tagFont)
2174 {
2175 node = stripSpan(lexer, node);
2176 continue;
2177 }
2178
2179 if (node.tag == this.tt.tagLink)
2180 {
2181 AttVal attr = node.getAttrByName("rel");
2182
2183 if (attr != null && attr.value != null && attr.value.equals("File-List"))
2184 {
2185 node = Node.discardElement(node);
2186 continue;
2187 }
2188 }
2189
2190
2191 if (node.content == null && node.tag == this.tt.tagP)
2192 {
2193 node = Node.discardElement(node);
2194 continue;
2195 }
2196
2197 if (node.tag == this.tt.tagP)
2198 {
2199 AttVal attr = node.getAttrByName("class");
2200 AttVal atrStyle = node.getAttrByName("style");
2201
2202
2203
2204
2205
2206
2207
2208 if (attr != null
2209 && attr.value != null
2210 && ((attr.value.equals("MsoListBullet") || attr.value.equals("MsoListNumber"))
2211 || (atrStyle != null && (atrStyle.value.indexOf("mso-list:") != -1))))
2212
2213 {
2214 Dict listType = tt.tagUl;
2215
2216 if (attr.value.equals("MsoListNumber"))
2217 {
2218 listType = tt.tagOl;
2219 }
2220
2221 Node.coerceNode(lexer, node, this.tt.tagLi);
2222
2223 if (list == null || list.tag != listType)
2224 {
2225 list = lexer.inferredTag(listType.name);
2226 Node.insertNodeBeforeElement(node, list);
2227 }
2228
2229 purgeWord2000Attributes(node);
2230
2231 if (node.content != null)
2232 {
2233 cleanWord2000(lexer, node.content);
2234 }
2235
2236
2237 node.removeNode();
2238 list.insertNodeAtEnd(node);
2239 node = list;
2240 }
2241
2242 else if (attr != null && attr.value != null && attr.value.equals("Code"))
2243 {
2244 Node br = lexer.newLineNode();
2245 normalizeSpaces(lexer, node);
2246
2247 if (list == null || list.tag != this.tt.tagPre)
2248 {
2249 list = lexer.inferredTag("pre");
2250 Node.insertNodeBeforeElement(node, list);
2251 }
2252
2253
2254 node.removeNode();
2255 list.insertNodeAtEnd(node);
2256 stripSpan(lexer, node);
2257 list.insertNodeAtEnd(br);
2258 node = list.next;
2259 }
2260 else
2261 {
2262 list = null;
2263 }
2264 }
2265 else
2266 {
2267 list = null;
2268 }
2269
2270
2271 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2272 {
2273 purgeWord2000Attributes(node);
2274 }
2275
2276 if (node.content != null)
2277 {
2278 cleanWord2000(lexer, node.content);
2279 }
2280
2281 node = node.next;
2282 }
2283 }
2284
2285 /***
2286 * Check if the current document is a converted Word document.
2287 * @param root root Node
2288 * @return <code>true</code> if the document has been geenrated by Microsoft Word.
2289 */
2290 public boolean isWord2000(Node root)
2291 {
2292 AttVal attval;
2293 Node node;
2294 Node head;
2295 Node html = root.findHTML(this.tt);
2296
2297 if (html != null && html.getAttrByName("xmlns:o") != null)
2298 {
2299 return true;
2300 }
2301
2302
2303 head = root.findHEAD(tt);
2304
2305 if (head != null)
2306 {
2307 for (node = head.content; node != null; node = node.next)
2308 {
2309 if (node.tag != tt.tagMeta)
2310 {
2311 continue;
2312 }
2313
2314 attval = node.getAttrByName("name");
2315
2316 if (attval == null || attval.value == null)
2317 {
2318 continue;
2319 }
2320
2321 if (!"generator".equals(attval.value))
2322 {
2323 continue;
2324 }
2325
2326 attval = node.getAttrByName("content");
2327
2328 if (attval == null || attval.value == null)
2329 {
2330 continue;
2331 }
2332
2333 if (attval.value.indexOf("Microsoft") != -1)
2334 {
2335 return true;
2336 }
2337 }
2338 }
2339
2340 return false;
2341 }
2342
2343 /***
2344 * Where appropriate move object elements from head to body.
2345 * @param lexer Lexer
2346 * @param html html node
2347 */
2348 static void bumpObject(Lexer lexer, Node html)
2349 {
2350 if (html == null)
2351 {
2352 return;
2353 }
2354
2355 Node node, next, head = null, body = null;
2356 TagTable tt = lexer.configuration.tt;
2357 for (node = html.content; node != null; node = node.next)
2358 {
2359 if (node.tag == tt.tagHead)
2360 {
2361 head = node;
2362 }
2363
2364 if (node.tag == tt.tagBody)
2365 {
2366 body = node;
2367 }
2368 }
2369
2370 if (head != null && body != null)
2371 {
2372 for (node = head.content; node != null; node = next)
2373 {
2374 next = node.next;
2375
2376 if (node.tag == tt.tagObject)
2377 {
2378 Node child;
2379 boolean bump = false;
2380
2381 for (child = node.content; child != null; child = child.next)
2382 {
2383
2384 if ((child.type == Node.TEXT_NODE && !node.isBlank(lexer)) || child.tag != tt.tagParam)
2385 {
2386 bump = true;
2387 break;
2388 }
2389 }
2390
2391 if (bump)
2392 {
2393 node.removeNode();
2394 body.insertNodeAtStart(node);
2395 }
2396 }
2397 }
2398 }
2399 }
2400
2401 }