=Paper=
{{Paper
|id=Vol-1536/paper3
|storemode=property
|title=
CRL: язык правил анализа и интерпретации произвольных таблиц
(CRL: A Rule Language for Analysis and Interpretation of Arbitrary Tables)
|pdfUrl=https://ceur-ws.org/Vol-1536/paper3.pdf
|volume=Vol-1536
|dblpUrl=https://dblp.org/rec/conf/rcdl/ShigarovP15
}}
== CRL: язык правил анализа и интерпретации произвольных таблиц (CRL: A Rule Language for Analysis and Interpretation of Arbitrary Tables) ==
CRL:
© . . © . .
. .. ,
shigarov@icc.ru slv@icc.ru
$ #) # [1, 5, 6, 14, 15].
) $# -
! " # " #
$ %!& #, !& " $ # $ %!& #.
' # , . %!) $!
( #! $'% & 3 CRL (Cells Rule Language).
' ) " # !& & !) $! CRL $ &
!&. % $ # #, !)
' *) [15]. $'
" # $ $ % ) #! , ' $'!& # !%
' ) " , !% $ $ ! %! ! # !&
$ !& ! ETL . $ #.
#%!) $! # ! CRL $
$ # & #, % + '
$ +) $! % ! ! * ! " :
+& '& ', & !
*). - # $ & .
# # -&
. %! CRL ! CRL $
$! % $'!& #, ) ! $-
+& & ) ). Drools [8] - ! $! .
1 $ % CRL DRL
1 (Drools Rule Language) [8] # %
& Drools Expert [8].
#!, ! " * '% ) ! $
' # ( , Excel), + $ . $ 2
!& '!& !&. $ ! ! $ # #.
( #!, ' «# !&», '!& !& "
!% $! $ !&. ! $ 3.
% ) " # 6' ! # ! %$
. $! CRL $ 4.
# !& '!&
!& % ETL #, 2
'+) + -!: $ ' !&
$ $ %!& #, " # & $ ! ! $ #
$$ $ !&. # $% !: -
1 " # '!& !& ! -$ !.
'& *), .. - ! ! [3, 16, 17]
: « & - », « - » « - ! %$ ) $
» (. 1). , + ) $), ! +& ! . (
#, - $' $! % ! $ % -
$! #!
XVII ) ) .
DAMDID/RCDL’2015 « , TANGO [16]
# %$
», ! , 13-16 ") !&. 6!) ")
2015 $ % !) !&
22
'! ! , %$ ! !&, + % #.
! !, !! ! $ 3. !
! ( , WordNet). Embley . [3] $! $! CRL
%$ $ # , DRL. %! $! $),
'+ % ! ") $'! $'!&
!& ) Tijerino . [16]. ( ) #.
8) ! $ $% # 6 , $! CRL $ %$ %
9 . Wang . [17] - " #,
# " %!& )
# & , ) ( ) $! YAML,
! $ $) PROBASE. -$ , "'
'! ! # [4, 17, " , #.
18] + %$ !
$ -$! #. 3 " #
- ' ,
' $ ' " # $
#! ' $
) ) " #.
-$ ! ! [2, 4, 10-13],
*& $) )
, ! $ #
), ) % )
" # $ #.
, Gatterbauer . [4]
$ '% )
) " # " CSS2. -
%$ $'! . 1. & , , & $
%& +& HTML #!.
#. ( Pivk . [12, 13]
TARTAR
$# " # HTML # '!& !& !
" ( ' ' % #!,
") !). TARTAR [18], $ !:
- & # 3-& « & », « » « ». &
!& . Kim . [10] %$ $' !&, !
$ ), ) - ) #. 6 $' !
$! ) " # $ #, ! % ) % , !
!& & !& !& !% ! $ #!, ,
5- #. & & Embley $ #!, " #),
Nagy [2, 11] " # +) . 6
(HTML) # # $ !&. % ) ( ).
%$ '% $ ! $ ! % &
#!, ! , $ . - ,
$ # ) $! !& $ ! , !
) . [2, 11] % ) . 6 &
%& ' +& !% # % ) )
!& ! & ) . !
!& #. . 1.
'! -$ ! ! # CRL &
# ! %$ ' " #
' ) &, !% " . ')
& !& #. 1 ! CRL !%
! ! ! "! $ '!& :
! ' ! #, ! '), & , .
$ % % $# (cell) 3 , $-
! ! ) ' % ' & !& '!&
). !&, $ '!& $ ) ') #!.
$ ! , * 6!) ", +) '),
& $ $% + &, ! '$
#& ': $ ! !. + ( !! $% -
<$ ! ! ! *! *" ):
23
1. !: cl 3 !) cr 3 !) #, ' . ? $% '
rt 3 & rb 3 ; $ +& ' !& : cell, entry, label
2. ! ) style, ! $ category, +& "
!&: font 3 *", horzAlignment 3 : '), & ,
$ % vertAlignment 3 - , $ ! ),
! ‘$’. $
% ! , # fgColor 3
%$ % &
bgColor 3 $ , !
& &, ! !!
# '!& # (leftBorder 3 ),
"!. - !% '!
topBorder 3 &), rightBorder 3 ', ! ! $* !
&), bottomBorder 3 &)); "!. $% ) '
3. text 3 , indent 3 $ ' ‘:’. @ '
( ' ' ); ! , !'
4. '! !: entries 3 & ) $' !&
labels 3 , !& $ ) («» « %»). '
'). $! ! MVEL.
+% $ ) ‘,’,
(entry), '
$') !& $ 9#. ) :
+& ): value 3 $';
cell $cell : constraints
cell 3 ! ') & ;
entry $entry : constraints
labels 3 # !& .
label $label : constraints
(label) 3 , +
category $category : constraints
, ' + : value 3
$'; $ !& $% CRL
'), cell 3 ! >; category 3 #& & CRL , ' !
! , ) !! ! *" ,
; ' ' & , parent 3 !& .
+ % , children 3 4.2 (
> '& .
% & (category) ' - #. , $%
" , ! +& +% 9 ') & ',
+& ): name 3 ; labels & , +
3 +& ) . $', - $!
$! . & $)
! ! $ ! Java $ $# #! !%
! * JavaBeans + % $
[8]. 1 $ %$ % & - !& '.
'!& !& " )
, $+) #"- F # $ $%
# «JSR 94: Java Rule Engine API» [9]. - 9 ') $cell, + $ n
!) " -$ $ -& , n ', $ !& %
. > !
&:
4 CRL split $cell
6 ! " ) i ') !'
CRL # ! . &
i & ) '). '),
'% , ! !
'! $% $,
$ ! "!, 3 ,
' %, , &
$ + #! +
9 ') $ >.
' * > - .
) %
' ! # CRL . a b a a b b
#"# - $!, c d c d c d c d
) $ CRL ) DRL g 1 2 e g 1 1 1 2
#, : e
h 3 e h 3 4 4 5
http://cells.icc.ru/pub/crl. 4 5
f g 6 f g 6 4 4 5
4.1 ' ɚ ɛ
. 2. 9! () $! () ').
? $! ) '
$ !% $ ') '),
& , $!
24
+ CRL $ $% cell $c : rt>$corner.rb, cr<=$corner.cr
! 9! ') (. 2, ɚ). then mark @RowHeading -> $c
$% #! % , $!) ( . 1 #
. 2, ɛ: ' $ & )
when cell $c : cl!=cr || rt!=rb, !blank ) ') $cell. -
then split $c +) " $' &
!) #. ! #! $ 3 ! !
% * ' $>! '). entry_value label_value :
!& ' ' % new entry entry_value -> $cell
) ) '). & new label label_value -> $cell
'&, % 9>! ') ', $' $
"# ! # * & -
#, ! CRL
') $cell, %$ % +
.
$ " :
! ) # new entry $cell
9 & & ' $cell1 $cell2, new label $cell
) +) ), :
$%, $! &
merge $cell1 -> $cell2
! # '
$%, $cell2 !' % ! . -
! ! +% $cell1 ') & & $ !
, ' ! & % '). %! ! . '
& 9 ) ') $cell2 $ !) %)*
, ') $cell1 $ # .
! ) #! ') . F $! ! $ & )
#. F # $ '. +
$! ') $cell CRL , $ & ).
@mark 3 ! ‘@’. I , & ' .
set mark @mark -> $cell , $ , ! '),
' $ % , !&
% & $% ) !$% ), ! «\d+» ( % #"). F
& '&, $ $% ') - %$ DRL matches.
! ) #! ! ! ) ' ) $ &
% & $'! # ' & :
. when cell $c : text matches "\\d+"
!' , ' ! then new entry $c
% + ') , !
! "# C1 C2 C3
#!. ,
# $ & «*», a = 1 b = 2 c = 3 a b
ɚ ɛ
« » « », ') $ d = 4 e = 5 f = 6 c 1 2
& -& & !% g = 7 h = 8 i = 9 d 3 4
! + :
. 3. J') )
@head, @stub @body. %$ -&
«'=$'», «'» ),
!& '& +& $
«$'» & 3 (ɚ); $!' )
$ % ! '), & !
# $ : '
+& '.
! 3 (ɛ).
, #& !& . 2, ')
# $ $% ': !
(‘1’,...,‘6’), $ # (‘a’,...,‘d’) !& #& ')
(‘e’,...,‘h’), $ & % & .
% ) ') & . , # . 3, ɚ, '),
(, + CRL ) ) ,
') $c, ) ! & «'=$'». - ,
! $corner, 3 ' «'» ), «$'» 3
@RowHeading, ! + % - ) '),
$! ) & . $
$ : $ ' «'» & ) $ '
«$'» & ' $ :
when
when
cell $corner : cl==1, rt==1, blank
cell $cell : rt>1, $t : text
25
then #& ') % ),
new label left($t,'=') -> $cell . 4, ɚ. F # !&
new entry right($t,'=') -> $cell $ ) . 4, ɚ, +
!% %$ , ' ! % ,
F #! (. 3, ɛ),
" ! $ ' *, ,
" # & $!&.
$ ') &
, ' ') ,
:
, ) ,
, %$ % + CRL when
$ +& : cell $corner : cl==1, rt==1, $t : text
when label $label : cell.cl > $corner.cr
cell $c : cl==1 || rt==1, !blank, then set category token($t, 0) -> $label
$t : text U%, CRL "# token $ +
then $ $t, $ $ ') &
new label extract($t, "[-]+") -> $c $corner. $% %$
new label extract($t, "[a-z]+") -> $c ' , # ) )
$label.
CRL "# extract $ $
$t ') $c '%, + . F !%
!, # ! %)
. $label1 ') $ $label2, "
& ( ) ) :
% & . J$! CRL
set parent label $label1 -> $label2
$# .
$ & $label # , ' , $!
! ) ) $category, ) ) * ,
') : ! % ) .
set category $category -> $label
) $ * #
#!.
, !
6 , $ " !
! category_name, +
$' + $ . V
:
+ %: $label1,...,$labeln,
set category category_name -> $label
$label1 3 %, $'
U%, , ! ) $labeln ' $' & $ -
$ & , ' .
! ) #!. V !& $') !%
+ , $! ) $! , & '&, + !
$label. ', $ $' ) %
category_name, ' %& & .
' # $label. *)
. 4, ɛ. , '
a b $ ,
c " ! ,
& . #
A
a1 a2 . 4, ɛ !) %
B ········c11 1 2
!) '! . + CRL
········c12 3 4
b1 1 2 ɚ ɛ !& )
% +& :
b2 3 4 ········c21 5 6 when
b3 5 6 d cell $c1 : cl==1, $l1 : label
cell $c2 : cl==1, rt>$c1.rt,
········d11 7 8 indent==$c1.indent+4, $l2 : label
. 4. J') & no cells : cl==1, rt>$c1.rt,
& ): ‘A’ , " !& rt<$c2.rt, indent==$c1.indent
$ ' * (‘a1’, ‘a2’), ‘B’ $
then set parent label $l1 -> $l2
(‘b1’, ‘b2’, ‘b3’) 3 (ɚ); & ,
" ' 3
(ɛ). * . !& #&,
%, ' %&
) , %$ %
F $!
! ! & ',
) ) > . !&
, , -
26
. , !& #&, ) !, ,
+& $% $ & , % )
!&, ' «*», . - , $label $
!) # « », " ), # & $entry
% ) . & '&, ! . - ,
%, ' , ! $ , ! # %
' ) ( #), ) & $entry ), ! %
. > $#.
$ % $ CRL (, # & $entry
+% . F $label1 $' , $! !
$label2 !% ! ) ! label_value, $ ! )
+ $ : $category ! % +) " :
group $label1 -> $label2
X . 6 $ add label label_value
( ) from $category -> $entry
) , - ', !
>. V ') $! $' label_value
, ! 9 . $category. 6 ) ,
', $> ) . U ,
# ) ), ) $ $!
, ' %! $ ) & $entry.
) ! ! % ) V+ # % &
. - , $ ! $entry #"# ! $'
- ) !, # )
label_value %$
). ! $% $ ) !
, $ ! !
$! )
category_name, ), !+)
$ * # #!.
>, $ :
( !, !& !
, !
add label label_value
. $ * , from category_name -> $entry
) $ & $ %
' ! , ) '
# > . : + #!
$! category_name. V
, , ' #& . 2,
, $> .
, ! $ $
F ) $
! #,
%$ ) # !*:
, $% + :
$ $'
when
label_value.
cell@RowHeading $c1 : $l1 : label
%, ' " !
cell@RowHeading $c2 : cl==$c1.cl, $ $ % $ ',
cr==$c1.cr, $l2 : label & CRL &. 1
then group $l1 -> $l2 $ % ), % '%
$% $! ! , !% $
+ ! , , ' ! ) #!. ,
# . 2: {‘e’, ‘f’} {‘g’, ‘h’}. ' # '% $
$% , !& $ !% $ , ' !
$ # - !& & ! &$ ) #)
) . $ , . ',
# $ '% " #
CRL .
. F #
F ! #
$! & $entry ) $label:
& ). $ $ '
add label $label -> $entry $ + CRL :
- ! $ & # $ ) ) “tons”
: & !% $ % $ “unit”:
) ) ) . * - when entry $e
%)*) # then add label "tons" from "unit" -> $e
# #.
27
a b Y , '
㜿∞ἲ බ * + #
c 1 2* & $! ! $!
Z 1 2
d 3 4**
ɚ ɛ ')& ! 0,
ఞ㤿 ୍
! ! & 1.
*u \ 3 4
** v ୕ゅὪ ୕ ᅄ & . F %
! $! CRL
. 5. (# (‘u’ ‘v’) 3 (ɚ);
%!& #), & :
') ,
$ '), $'
& 3 (ɛ).
& ; " ')
; ! ' ) " #
#, $ ) . 5, ɚ, & .
‘2’ ‘4’ $! ‘u’ ‘v’
'$ ! ‘*’ ‘**’. ( 5 + -#
% $
#% ) . + CRL (#! + ($)
!% %$ !& #, ' $!& ).
' ! # % & , $% % ' )
$! $ & , %$ !, & % !, , ' !
! % $ % & $ #
‘*’: %!) # !& .
& '&, - $ " %
when
#& &
cell $footer : rb==table.numOfRows, ($ $)), ' ! %
$fn : text & ! .
entry $e : cell.text matches ".+\\*+", [15] - % $ , '
$ref : extract(cell.text, "\\*+") $ # #
then $! % $! DRL
add label between($fn, $ref, '\n') Drools Expert. F!) $! +
from "Footnote" -> $e $' # !&
, $ #
) ' - , ! $* # ' %$ % % '%
« » #! $footer, '), $ ). -& $ )
$fn, & $e, $! ') $ -
!& , +) $! CRL, DRL #&.
! (.+\*+). - ! J$! CRL ! + !
(\*+) $ ) $ref . !& $' DRL #)
) ', CRL "# $ $ $ " % $#
$fn, & $ref ! $ # #. -
CRL % " DRL.
(\n). !) &
!& #& ') % %$ # #
% & ) (. 5, ɛ). 6 !& '!& !&. J$!
- ! ' $ 3, !) ", CRL % $
+) '), '!& ' ETL
: & ! , ) " # ' ) " #,
! & . 1 $ '% +) - !& #&, &
& & # , -#&.
-& &. , + F%)* $
# & ) & $ ,
& $!'!& #, $! "#)
!& . 5, ɛ: , ' '!& !&, $
when '.
cell $c1 : containsLabel() ! " )
cell $c2 : containsEntry(), 88 ( _ 15-37-20042 )
cl == $c1.cl || rt == $c1.rt $ 8 ( -
then 3387.2013.5).
add label $c1.label[0] -> $c2.entry[0]
add label $c1.label[1] -> $c2.entry[1]
28
. [13] Pivk A., Cimiano P., Sure Y. From Tables to
Frames // Web Semantics: Science, Services
[1] Embley D.W., Hurst M., Lopresti D., Nagy G. and Agents on the World Wide Web. 2005. Vol.
Table-processing paradigms: a research survey 3, No 2-3. pp. 132-146.
// Int. J. on Document Analysis and [14] e Silva A., Jorge A., Torgo L. Design of an end-
Recognition. 2006. Vol. 8, No 2. pp. 66-86. to-end method to extract information from
[2] Embley D.W., Nagy G., Seth S. Transforming tables // International Journal on Document
Web Tables to a Relational Database // In Proc. Analysis and Recognition. 2006. Vol. 8, No 2.
of the 22nd Int. Conf. on Pattern Recognition. pp. 144-171.
Stockholm, Sweden. 2014. [15] Shigarov A. Table Understanding Using a Rule
[3] Embley D.W., Tao C., Liddle S.W. Automating Engine // Expert Systems with Applications.
the Extraction of Data from HTML Tables with 2015. Vol. 42, No 2. pp. 929-937.
Unknown Structure // Data & Knowledge [16] Tijerino Y.A., Embley D.W., Lonsdale D.W.,
Engineering. 2005. Vol. 54, No 1. pp. 3-28. Ding Y., Nagy G. Towards Ontology
[4] Gatterbauer W., Bohunsky P., Herzog M., Krüpl Generation from Tables // World Wide Web:
B., Pollak B. Towards Domain-Independent Internet and Web Information Systems. 2005.
Information Extraction from Web Tables // In Vol. 8, No 3. pp. 261-285.
Proc. of the 16th Int. Conf. on World Wide [17] Wang J., Wang H., Wang Z., Zhu K.Q.
Web. New York, US. 2007. pp. 71-80. Understanding Tables on the Web // In Proc. of
[5] Hurst M. The Interpretation of Tables in Texts. the 31st Int. Conf. on Conceptual Modeling.
PhD Thesis. UK, University of Edinburgh. Springer-Verlag. Florence, Italy. 2012. pp. 141-
2000. 155.
[6] Hurst M. Layout and language: Challenges for [18] Wang X. Tabular Abstraction, Editing, and
table understanding on the web. In Proc. of the Formatting. PhD Thesis. University of
first Int. Workshop on Web Document Analysis. Waterloo, Waterloo, Ontario, Canada. 1996.
2001. pp. 27-30.
[7] JavaBeans Specification 1.01 Final Release,
http://www.oracle.com/technetwork/java/javase/ CRL: A Rule Language for Analysis and
tech/spec-136004.html Interpretation of Arbitrary Tables
[8] JBoss Drools, http://www.drools.org
[9] JSR 94: Java Rule Engine API, Alexey O. Shigarov, Viacheslav V. Paramonov
https://jcp.org/en/jsr/detail?id=94 The paper discusses issues of the transformation of
[10] Kim Y.-S., Lee K.-H. Extracting Logical information from arbitrary tables presented in
Structures from HTML Tables // Computer spreadsheets into the structured form. These tables
Standards & Interfaces. 2008. Vol. 30, No 5. pp. contain no relationships describing their semantics.
296-308. However, only after the semantic relationships are
[11] Nagy G., Embley D.W., Seth S. End-to-End recovered, the information from an arbitrary table can
Conversion of HTML Tables for Populating a be loaded into a database by standard ETL tools. We
Relational Database // In Proc. of the 11th IAPR suggest the CRL rule language for table analysis and
Int. Workshop on Document Analysis Systems. interpretation. It allows developing a simple program to
IEEE. 2014. pp. 222-226. recover the missing semantic relationships. Particular
sets of the rules can be developed for different types of
[12] Pivk A., Cimiano P., Sure Y., Gams M.,
tables to provide the transformation step in unstructured
Rajkovic V., Studer R. Transforming Arbitrary
tabular data integration.
Tables into Logical Form with TARTAR // Data
& Knowledge Engineering. 2007. Vol. 60, No
3. pp. 567-595.
29