=Paper=
{{Paper
|id=Vol-1536/paper9
|storemode=property
|title=
Автоматизация сбора информации о научной деятельности для тематических интеллектуальных научных интернет-ресурсов
(An Automatization of Collection of Information about Scientific Activity for Thematic Intelligent Scientific Internet Resources)
|pdfUrl=https://ceur-ws.org/Vol-1536/paper9.pdf
|volume=Vol-1536
|dblpUrl=https://dblp.org/rec/conf/rcdl/ZagorulkoAS15
}}
== Автоматизация сбора информации о научной деятельности для тематических интеллектуальных научных интернет-ресурсов (An Automatization of Collection of Information about Scientific Activity for Thematic Intelligent Scientific Internet Resources) ==
- © . . © . . © . . .. !, !" zagor@iis.nsk.su ah.irishka@gmail.com Alexey.Seryj@iis.nsk.su " & ' – % *(, $ + * ( *' " # " $% &" " ' * . &$ , ' % ( "* % % *'%, ( - , "&($) *'$ '$ ( *, &%) % ". ' * , ( &" " "" ', ' * *$% %)% & " *, . , & * "* [3], + + & . % " % ( & & *' " ' * ( ', " % % ( % * " *, *( / ' * "-%$) & * (% & ' . 9 % ', "* $)% % * (% ' % + ( * . # / & % % , % % " + & ) ( ) &%) * ($ ' * ( *"$% " * (% [1, 13], + ', " * ( % " & " % & - . *, * ($ ' " & & &+ & % / " % ( 22 (& 3 13-07-00422). % , % " ( $ & "- "- 1 ', %% + %. " "&(% / & 2 *%, &* $) % " % *, + :( ! & % " ' & % ( ' $ , "&($) $ % & ( * *'$ '$ ( * & %. ' & " 6 % % &" " & + *, + / & '&'% ( (& '$) "" . ( - ( ! ) [12], "&($) ' $ ; * ! % %% ( $ &+ ( % % ( ONT ) [4], %, % & " *. &% &% " *, & 8 & *% ! ' " , ( " & " "" & '% & . "- C + R , * % & % ' "- XVII !" " *, ' DAMDID/RCDL’2015 « # "" . 6% '% % ! # », $ , 13-16 "- - ( 2015 , & ' "- 62 & %$% "- ( %) % ), .. ! . (%, &) $ "- -% ( % &%% & %$% & * *&. :* * & %% SN I C , I R ! , &% & % &% IC I C1 ,..., I Cn ! – + &) $ ( . > % / + &% % & & "- , .. / *& % C, ' , & ONT , .. ! . i : I Ci Ci , Ci C , 3 $ # IR I R1 ,..., I Rm ! – + / *& % R , & + *( " ' % ! ONT %*$) "- * & %% " *"* + I C . * ' &" & % . # (, " % ! * " '$ " *'%, & , *%* , ($) * & " '%, - & * & )%, & ! . 8 *, : " * ! , '% + " & ( - -', $) * ( $ , *( . * ( % " * * . # %* / &( &% , &*( % ' "* & * && % &% " * ! %) % * (% ', & % ( " ( & (. % . # (, &, [9]), & &, "* $)% + , . # / " , , % + & ) " , & * % , + & * *" " "" *( % " * ', / " & * &* , * &*'$ - . "- %, & * ( , ( & & * % . 6 % & % ( " ( ' " % - [11], *" + , , , &% & ( *. # , , . + % " * & , & % ( - + [2], "* % '& % &%, & & " & ' , " & * % + ) * ! . / " ""( , + & [6, 7], % %% !" , " & "* $)% %. " %* " ' % ! $( Dublin core [5]. $) /&: % *( &% x " * ! *(, % % &*( ! , - . % $( + &% web- x * ( ' * , * $) / , - . "" ', *" x & ( ' " *. ! . ! ( # & + * % "% '% & ( & " ' * *% ' , $( $) & (. .1): ! , + + & , * (% ', & " $)% *% ' ! 
, "" (& , + "* - (>6 ( , web-). ). * ! + $( %*( * , +) " * %* 63 .&. ( % / & * % &' &- ). 6 % ( ( * ( & & & * $% , &) $ ((% ( & & & ). ' *& ( %% *( + & & *& q ' ( ) d & 1. $$\cos(\theta) = \frac{\vec{q} \cdot \vec{d}}{\lVert \vec{q} \rVert \times \lVert \vec{d} \rVert} = \frac{\sum_{i=1}^{n} q_i \times d_i}{\sqrt{\sum_{i=1}^{n} q_i^2} \times \sqrt{\sum_{i=1}^{n} d_i^2}} \qquad (1),$$ n – (( ( ), i – &*'% . 5 . 1. & " ' 6 % *& % ! "% 4 % - '% * ( , *', '', & ', ! " * & *, ' ( . A % % ! , *& %% " * , * / ( * % >6 - '% & , *'%, &, . + , &) >6 , '% & " '%, .. " "- %*% ( ) , "- "* ( % , ("- ) & $) + '% ( , % , + '%, "% % & %% ! "- +% !" . . 6 % + * / *% , ( *& >6 + * (% ', $($) " & % % ( $ – / & " , . 6 % " *, ( &% & * (% ' – & . / " (% * ( @ & & % " & *% * - & & * ( &), + *&, * (, & / &. * , @ * (% ' ) % & %$) &%% " * - , ( & , *. : *& $% % * >6 . 6 %* , & * ! . @& " & * ( *& % * & ! (HTML, DOC, PDF, TXT .). ! HTML &( $. / & % %% % & % ")% & Google, ; ' , & Bing (* & , .. * (% ' " & * * & & $) HTML-'. ' " [10]. 6 % " (% * HTML-' & %% DOM- ' & %% DOM (Document Object Model), & * ( , $) &" & % %% % *& % + + ( (, HTML- ( & '. 8 ') " "- [8]. ! $($ " $ ( ( $) " & %% * ( ) *& HTML- DOM- + ' * ( '. / $($% & / " '. ( &- , .. , ) * , &, & , ") &" 64 .2. * ( ' & * " I" & % " XML- , A+ * / " + & % % "- , " && (Marker), * , * $) *$) , +) & + "- , % * $ '$. @ , && " HTML-'. # " % & " Class, % + & * ' , &$) "- & %$) *$% ""( , * $) " & . " * $) Term &* % * -'. * , * $) * #+ , ( '% )%, ' (&, & & " ' & %$) % & * ! , ( & ); & PType * & + " * * ( &". , + & % !&, '% & , + " * (&, $ * ), & & , * & FragType – & , * *' & & " ', " * % '% (&, " &$) & . 6 % + * / &" '). & % * , ( % + %* , % " . & * ! , *% " ! . 2 & " % . 6 % % * (% ' & (- & * $% $) / & . 8 " &* % * %* , & % * " , %*( * , + «!*» «'%», + & + / &. & * « " '% & » «J( %*( " &* * & », .. "- , &$) '$ * - , & " ' & ( % '% & & . A+ " , &*( % %* , * - , +) * (% ', &% " Class '$ * ( %* . + " " (Attr), (Relation) (Object). 65 A " * , " + 6 , * + *% ""( , " * " & " , * '$ * , *+ $) ': &) $ . !*% ""( x " " ( " , *$% $) " " ""$% . / (Attr Object) & & engine. 
" , " &* % & : * ( "- % , +% + & % % &-* , ) . " % &% "- x % " " * &' & * % * ( . !&, ""( , & "" *' &*' &% ) , & ' " "- + . / * (% * % $) : # , ""( '% &"* % , # , # " . , # , # , #"" # ; x " " % %% , & – $ # $; % *( $) & – # , # . "- " * ) '. % & , & &' * (% ' * HTML-'. : "*, &' "" " &) $ $) (% & PType FragType %$% , * " * ( / HTML- ""$% &' ""( , ', $, * , " . 6 % * " . / % (" "" &, "- * ( %* * HTML-' " "- . % / . A " * , ' & %% DOM-, !&, & «!' + $ & %* » (http://www.ruscorpora.ru) '. ( * $ « & » + &%, ", & ', & & , * « ( & » – (, , *) .) '$ & *'%, % / ( %% , ( $) & , * «& " '» – , «& '», «* '» '$ & " '% & & .. .&.) & %$ % *( I" , & (. * (% '. 6 .2), &* * ( / '$ %$% * ( " . ' . 6 % * (% / / % * (% ', & * $% / . : , / , %$) & , & , * $)% $, ) % & & & % % , DOM- ', & &, & " '% & & , &+ & , ) & *'%, ( $) & , ) . 6 / + % & * $% ""( " , &' % '. & % * (% ' , ( + & & * " ( * " " , $) "* "- "- '. &%% , , , . * HTML-' % / & %% & ! .2 & * & * &%) " " ""( PublicationList PersonList. * Class * ( ' &*( % *" & & " ', / " . 6 % + " " – % "" ' &. (Class, Attr, Relation, Object) &) $ &$) * $% 6 & $) , & *% & FragType. : + " , / ) ', % A " * , * % * HTML-', % &' " - '% & %% * +% + * %. ( ' "- , .. . '$ & ( ! & % *% '. 66 ( ( *% ' + %* (*+ & ) , ' + E ij " g1 . " "&( + %* i ( "% * + V "- ' ', + $)% ! . "- ( % , V ji , $) l %*, & , * $% i " , %* %*% * e (.. % %$)% / *& % "- . : "*, *( *% + ( % i ' * $(% %*$) V j + "- * g1 ), & "- ) $) ! ( , & ( & l 1,2,, n & * * ( ' * . 6 % i * : (1) + V & , (2) + / + ' "- i + & ) V + " / (3) ! . i i + V + / V0 . )% ' i i "- * '. ( *(, ( v 2 V0 — ") * *(, i g G , "- v 2 , * $), % "- * *( " *+ "*, ' ! *( * , " V0i . # v 2i & / &% & ( ! & '$ " * + $)% . g1 . " & g 2 &%$% G V , E ! — ! , *$% " (, & ) ' ' g v, e ! — "- , * ( * "- , " + v 2 & . - . #( " ") "* ( ", *% ' "- .. *( " &% % & ") "- . & / g * % " / . # * "-% &% % & g1 v1 , e1 ! g 2 v2 , e2 ! , , " + + & $( , ") G , ) / — , & & * " . *( . 
: + / /& + , ( ' & , + "- - * *'%, &, '% g , %) g1 , g 2 , & " '%, " + * '$ ( , .. - ' , (, & .&.), % + (% G . "- + " & % % ! "- ' (, !" . & $ & " %*, "%* % "- * 7 &' , (% + " %* "- + , :( ( %)% ! . - &* %$ % *( %, " % # & $ $ "&(% & ' & %* + & g1 g 2 . $) *. / !" * & "- - / & *% + v 2 ( i i 1,2, | v2 | ) & g 2 . ! &% $ * & & '. "- v 2i %* & g1 6" % & + * ( *' &' " '. 6 % "-% e i {e1i , e2i ,, eni } , / ' *"% & " $)% *( "- v 2 & i ' * . " *( " + & ! * & & *" + "- Vi {V1i ,V2i ,,Vki } * * (% ' & , i &, *'% "%, $(% G . A+ * "- V j , $ ( , & $) " ""( , 67 * $) * ( ' *% : & ( & " '%. . :. 312. 3 5. J& , ( % . 2008. ( C. 114–119. [1] Chen J., Chen H. A Structured Information [12] .., _. >., I Extraction Algorithm for Scientific Papers based on #.A., A .. A'&'% Feature Rules Learning // Journal of Software, Vol. ( ( 8, No. 1, January 2013. P. 55–62. - // : XV # ( ' RCDL’2013. 14-17 [2] DeRose P., Shen W., Chen F., Doan AH, %"% 2013 . ; : ;_J, 2013. C.57– Ramakrishnan R. Building Structured Web 62. Community Portals: A Top-Down, Compositional, and Incremental Approach // VLDB ‘07, September [13] f #.#., @ ' .., f% f.!. 23-28, 2007, Vienna, Austria. P. 399-410. : " * ' % & // @ [3] Ferrara E., De Meo P., Fiumara G., Baumgartner R. 9 + (-( Web Data Extraction, Applications and ' « ' Techniques: A Survey // Preprint submitted to , &* Knowledge-based systems. June 5, 2014. 41p. "*» ( 4): 9 I, 2010. C. [4] Guarino N. Formal Ontology in Information 218–222. Systems // Formal Ontology in Information Systems. Proceedings of FOIS'98, Trento, Italy, An Automatization of Collection of Information June 6–8, 1998 / Ed. N. Guarino. Amsterdam: IOS about Scientific Activity for Thematic Press, 1998. P. 3–15. Intelligent Scientific Internet Resources [5] Hillmann D. Using Dublin Core, 2005. http://dublincore.org/documents/usageguide/ Yury A. Zagorulko, Irina R. Akhmadeeva, [6] Labský M., Svátek V., Nekvasil M., Rak D. The Ex Alexey S. 
Sery Project: Web Information Extraction Using Extraction Ontologies // Knowledge Discovery Enhanced with Semantic and Social Information. Studies in Computational Intelligence. Berlin: Springer-Verlag, 2009. vol. 220, p. 71–88. [7] Saggion H., Funk A., Maynard D., Bontcheva K. Ontology-based Information Extraction for Business Intelligence // Proceedings of the 6th international The semantic web (ISWC'07) and 2nd Asian conference on Asian semantic web conference (ASWC'07). Berlin, Heidelberg: Springer-Verlag, 2007. P. 843-856. [8] Stenback J., Le Hégaret P., Le Hors A. Document Object Model (DOM) Level 2 HTML Specification // W3C Recommendation, 2003. http://www.w3.org/TR/2003/REC-DOM-Level-2-HTML-20030109/ [9] Zhai Y., Liu B. Extracting Web Data Using Instance-Based Learning // Proceedings of 6th International Conference on Web Information Systems Engineering (WISE-05), 2005. P. 318–331. The paper considers the problems of information collection and extraction for thematic intelligent scientific internet resources providing the systematization and integration of scientific knowledge, information resources and methods of intelligent information processing related to a certain area of knowledge, as well as the content-based access to them. The approach to automatization of collection of information about scientific activity in the given knowledge area combining metasearch and knowledge extraction methods based on ontology and thesaurus is proposed. In accordance with this approach, for every type of entities (ontology class) the specific methods of information collection and extraction adjustable to knowledge area and types of information resources are developed. Each of these methods includes a set of patterns. In these patterns, for every kind of extracted information, markers defining its position are given as well as the engines implementing the algorithm of the analysis of the corresponding fragments of Web pages and
extraction of the required information from them. These patterns are generated on the basis of the ontology. To improve the recall of information extraction, the patterns use alternative terms in different languages from the thesaurus (synonyms and hyponyms) to describe the markers. [10] .., .., !.#., .., .., I #.A. $ ( ' - // # !_J. %: ' . 2013. :.11, & 4. . 5-15. [11] .. *'% " ( ' " - % & ( * // 68