=Paper=
{{Paper
|id=Vol-1536/paper14
|storemode=property
|title=
Исследование специфики применения алгоритмов тематической сегментации для научных текстов
(Specifics of Applying Topic Segmentation Algorithms to Scientific Texts)
|pdfUrl=https://ceur-ws.org/Vol-1536/paper14.pdf
|volume=Vol-1536
|dblpUrl=https://dblp.org/rec/conf/rcdl/BoyarskyGDKA15
}}
== Исследование специфики применения алгоритмов тематической сегментации для научных текстов (Specifics of Applying Topic Segmentation Algorithms to Scientific Texts) ==
© . . © . . © . . - boyarin9@yandex.ru natfed@list.ru graziokisa@gmail.com © . !. - kanev@emi.nw.ru Assoul@yandex.ru '"30 -'0 , ( '0 ), 2 1 ( $+), #& , ( , () " " # $% & . -. 6, $ ', '"3 , $ # ' # # ( #% '")# ()# #. &#% -)2) -#) +) ( *$ #'" +) $'"+) ', '-02 , -' ( [16, 26, #& , +), $ - # , 30]. *# $) <# $ $ (# '-#) $ '0() # ' (1 – 2 #& ), $'''") )) &#) ( $+' $ #" -' , #& +), ', 1 , $& '"), $-. ( ' + ( #-), $ ' # ) TextTiling, ' +#2 ,', ) . $. $'"+02 ''"0 &#% 0 + #1- - # (# . # # - $ '- ' ( #% $-' 0 -$ $ ()# ()# , $#, +# '"+2 # (+ , - *&& " , ' ( $) #1- # , &#% $ # , $) ". $-') $ #'") -# ) . ' #, ( # % $# #% -' , -' - +' (), +). $ # +) $- # - % )# -' $-1-, ( $-'0( 3 , $ )# # – $ #, ## ' ( , ($-', ' & , ' '0()# '# . +'" +) 2 $)3 ( $ ) +$ $'"+'" $'( #% . ' -# %' #, # )1- (0 2'" ' ) $ , ' 1 )) + %) # #'" ( (# '0(), 6&& " '-'" ) ', $ )# '1 " %'" ## $-$-' $' , $-' # , '), &#% ), ( . 1-# -#. # +#, ,- # +% &#% $ ( , XVII (, 0 (-", 3 +-( DAMDID/RCDL’2015 « # + -' -# $-#) (# , !" », # , 13-16 %' $'"+'. 6 +-( 2015 96 '" $ $ (# ( , $ - &# )1 +#),. ( # ), +),, $ '"3 202 , ' # )# $1 $'# )' + $ '- . $. (1). !' #) * $ #1 +-' " 2# # ' $ - $$). !' #) $ $$) -( '"3 ( ' ' # $'"+0 ''"0 &#% 0 # ( #% ( , + #1- - # (# . + $ - 1). -, $ ', - + ' +), ' # * -# 0 -(0 *&& " $$) – TextTiling [11, 12,] – + +-), ,, $-'02 , '-02 , 3: % 0 -'"), $-'1 ' x )$' '## +% -' , ), &# + +), ' $-'; # +#, $+ -$' % (#., $ #, [3, 5, 9], ' $'-'"" + N ; '"3 , ), $, [4, 7, 21]). 1 x $'-'") ) $$ 0 # +'") $ # * , ' # $-$-'1 -' W 1-; '")# ()# # $-') + , &# ' +## k - ( $ ( - - (., $-$-'1 , ) -'" 3# $ #, [5, 9, 11]). $'"+ '"+2 3# s " # $% & $-$-'1 . +# ' # s=1, $ # ' # # ( .. 0 (' $$ 0- #% '")# ()# #. - (W*k)- c $$ W- - ( *$ #'" +) $'"+) (W*(k +1))-, +# $$ W- - (W*(k + #& , +), $ - # , 1))- $$ (W*2)- - (W*(k+2))- $ (# '-#) $ '0() . -. - $$- $'- $'''") )) $ $-'1 # $-$-'1 0 %; &#) #& +), ', x % ' ( ,1" - , 1 , $& '"), $-. 
' ' M #1- (W*k)- '- + #" ( #% #)# # : , $#, +# '"+2 , ' ( $) #1- ¦ wn,i 1wn,i # , $) ", +) , x cos Mi n , 0 d cos M d 1 , (1) ' ( , - % -' ' +, 1 ¦ wn2,i 1 ¦ wn2,i $-'0( 3 , ' ( , . n n #% 1 -' )-' x - wn,i – n- i-# '. +# (', $&) &# -' -' $- ' - x ''") # ##) &% (1) -'" ) #. # 0 %) #1- ## ( ' # - 2 $ ' 1 3 $-'1 ' +%). !' #) $$) % 0 ( ' #) # ( $-' $02 , $ # #% + 0 + . , ' # DotPlotting [20] $'"+ $'- ,. $+ '- , [27, 31], -' ' + ' ( + -#) (), , $'-0 $) & , # $ # X Y '-) + , ' ( $'1 ; ' $+ % , x ('0) , # ( +# '") ) y ,- - () ), ' - ( ($'''") ): & ( $+ % , (x, y) (y, x). x '0) $ + – () $ *# +) #) +'" # #1), $-'1 , ; 0 -#, $'1)# -'" x # ( $ + – $'"+ - ' , ) $'"0 (. # , ' + , $ #)', -# '( $-' '- ; *## $ # # - + -, x +# '") $ + – $'"+ : ' # # +% $' ( # # ' # +'")# %,, ' # # +% $' ( # # # -' +#2 #)' #. C+ # * - ' $-302 $-'1 ; ' # 99 [4], # ' ()# +# +' + # ' ( ,1 x $'''") $ + – ' ( #1- # - , &#, +# $-'1 , )02 , + , #-# - # ( $## ( # $ +# ), ,- +) # #'" $'"0 * '0 $''' +# ), 1 #). 97 ' % 1. D ), ( +( ' & ( $ ( E+) C+# &# ( ( ($(. +) 1. Romme L' Art de la Marine, u Principes t Préceptes Generaux d &%+ 110261 l'Art de Construire, d'Armer , de Manœuvrer et de Conduire ds Vasseaux, par . R##e. La Rochelle, 1787. Chapitres VII, VIII. 2. C## , ' ') (' $ ', (02 161152 , 1 , $' 1- ' , ( ) $- # C###, $-# 1 !-# $&# % # ( ' 2. I" 1, 2. ' &%+ &' $ !'- J 3. -, 1793. ') 7, 8. 3. U-boat Williamson G., Johnson L.: U-Boat crew 1914-45/ ' 60563 ed. Osprey Publishing, Great Britain, 1995. 4. -- --) &' # . 1914–1945 / . '"#; 60560 . '. .!. '"% . – .: «+-'" !», 2003. 5. ) ( – 23800 -'1 %') - #- & % +), '"3 ' ( #, $' 1- ' # (#., $ #, [5, 6, 8, 13, 17]), ' +) - + #. $+'02 ,, $# # '0 + , # (0 -' $-,- ' # (" - $) + , $-' ' +#2 ,' $ 02 . $ #, -' ( (LDA) [2], 3 $'"+# #3 # + $ (2) $-' $-'0(" 3 ( . $ #, [7] LDA ' ($ #, WordNet) ' " ' + , ( ' #, 0 # *&& % , 102 $+'02 ) " - ' ( (# ' 3 #% . !' # TopicTiling [21], ''% -# ($ #, ) ' ( TextTiling, )( ' [5]. 
' ( + $ (4) $-' #) ,- #1- - # -# $'"+" $ # + ## ## , - $'"+0 &% ' DotPlotting #) ' ( () (# ', () ,- # #1- ## - & #, $- '" [25]. ( ), -' 1- ' !' + #) ), #1 $#2"0 LDA. )-' " - $'#, $$02 , , !' #) * $ $+)0 # - 0 '"0 $ $ ), '(30 *&& ", (# '"3 (02 #. -$),, '"3 #-, $ (# ' 1 ( $-'), ' , + -' $-' ' + ## . -' ' +). +)) $$) " (02 $ ' + #) $ # '" +-( # ( -'1) $ -'1" -# -# [21], ( #% +() ' [3, 24]. -),, - 1 -" -' '" ( , -' # , +), '0( , -# %" ## +) -$ 0 -( $') ' '""0. # - ) ' ( 2 " $- ) ). , -" ,, *&& " , *$ #'" % $# $-'1), ' # ( + #% , 1), ( + +% + + $# # ( #% '"), (), + )# . . C# +) – $'- # $'( ' + ' #% # ,- ' #) , ( #% #1- ## . (#., $ #, [7, 14, 21]). + '"3 + , '1 #-'" ' ( , + ' - ## $-' ' $ ##, $ (# # #" 98 cos M 1 A 0,8 0,6 0,4 0,2 0 + + C . 1. ! +# () # ( 1) %) # (H) $ -, $, $- -' «Romme» 3 # " % 3.2 !" ( + ' # )' 3.1 # ! ) TextTiling, ' $+( ) +'' " + #& , ( ' + #, +# + 0 #. # , +), ( , ' , '-'"" 3 ' # +( &%+ ) $ - # . D $# $- $ ) )3 (#. $. 2). ( $-') '. 1. '-#) ,- *$ # " ' " '-02 $ 3' $'''") &#) #& $#) ' #: +# ' W*k []; +) ' ($+. 1, 3), 1 , +# $) #1- '# s*k []. $& '"), $- ($+. 2, 4). # , - *$ # $'"+) $-,- # $'# - ( -# -' ' $# -' ), 02 -' +), $+' ( # - +%. '-" +)) $# #% . I( +# 3.3 ' # &%+# +), )$''" ' % ( #% $#2"0 OpenXerox [18], $-'1 - # , # ( ' 3 # +) – $#2"0 ( recall-precision [12], - [19], ' + SemSin [29]. $ $-' -' #) PP Pk [1], # WindowDiff. 1- + 1- )' &# (0 , - -. ( ' +. ( $ $ #, # WindowDiff )' &# , 2 + 20 ), $'1 &), % #, &#, +), '( )# +# + &# ), $ + ($+. 5 '. 1). $- ' , %, $'(), % #)# #% 2''" $- ' ##, $-', '"+2 . R# +' ()# #-# ('. 2). ( ' , -12 , 3 ( $-') %), # 2 ( ' . - ' % 2. ) $- $'-02 '- [10, 15, 22] ) ' - +( $ - # WindowDiff. - N % '1 $-') (false positive, FP) '## +% + -' $-' $$2) (false negative, FN) %), ) N+ '## +% + (( +# + -'1) #" +0 +( #" + # 2 '"), +-( #% . # , N+++ '## +% + (( +# + # WindowDiff $+' (" ' ( 2 '"),, $ ''"), '' 3 $-' %), 1 $ - N++' '## +% + (( +# + +)3 +( 3 # (' % $-'0( 3 ' & . + * # $-'1 - #- & % # WindowDiff (#., $ #, [22]). 
3# '- -' % ( #% $'"+'" ' F-# [28]: 99 2 P R # % s, W k $+' , ( '0) +( F , (2) $) , ' () ', 10 PR " & , $'), - % 02 %) #)'), &#, $ ( - (+0 (,) $ # $ - . 2). # #, ( * +'" $- )-#, TP TP -')# *$ #'"# '- [5]. P – (", R – H TP FN *# 3 , -'" 3 , *$ #, $', FP – ' ( '1), +( , FN – $) ' $'"+'". ' ( $$2), +( , H – ( ' -), % #. ( *' -' #) +' *$0 +# % #) # (A), #. . 1. #), &#, , % 1 # - - . - $# #% $- ' '-02 # +#. !' + '" +( #) (1) + # " # $ z. R( cos M, #"3 ' ) z, # ' " # #), . . % # (H). R# $- '" $' $'(), $'-'" C . 2. R( #) -' «U-boat» c . $ '" $- (TP) # ' " $- $ N+: – W*k=10, s=0; '( , - , $'-'", (A H) – (W*k)=20, s=(W*k)/2; )' #() ''") # ##) («$')») -' - 1 +% ' -' - , 4.2 " +%. ' $-' #('" )3, +# ' # )#, $'", $+' -'") TextTiling [11, 12] ( - %) ' + ' +, ( $# +% #) $'"+) & -' ), $) $' # " #, , 2 , $'1 $'(02 , % ' - $,- $ ,- '" '-02# +%. ' 1 3 $-'1 ' +%. 3 '") «$')» H #(' " '1) *$ #) $+' , ( $ # '" (FP), «$')» A – $$2) (FN). ' '")# ()# # ( ' + *# $-' '(3 +( F-#) " '( -' '" ('. 3). 6 + $ z " ' 0 - 1. #, ( $ # (10 2 '"),) # #, ( #) )3 # '( ) +# ' $ - % 0 ( #% '" $' 0 '"3 ( ' «'1), », $&# $( 0 +#1" #"3 (", $ '"3# (40 2.) ($ # +% ) $# ' # ' " , ( $ #% , # -' +), +-( '"0 %, ( $ - $- 0 + 3 ( $'), $'). # +'" -' ( «U- ( 1 -' (), . 6 boat» $- $ N+ $ - $'1 ' 2 - # ## $'"+ ) ' % 3, ) 1–3. ( # F-#) (2), - # " $ # +% , ' % 3. ' +# ' # ## – '(3 3 P R. S 1 2 3 4 ' #- + : , $ #, ' C+# ' [] 10 25 40 +% " ) *&& % ) #1- P R. F-# 0.06 0.04 0.03 0.17 4 & ! "! #) 1 '-' $-$'1 , ( #)') %) '- " %, +%. 1 +# ", ( 4.1 " ' $-') $ ( ) # $ *# $-. $ #, [32] : ( $ #'"), $# «...#1 +", ( +% (# – #% +# ' # #'") , & ( )-') *'# TextTiling [11] #- '"+2 , -12 - +0 #)'" ' (') +## (W*k) + k &#». - ), ' [23], $-$-'1 -' W 1-, $ -' $ &#'")# '02 # $) ' s=(W*k)/2. - 3 (+%# ' +-'#) $+' )-' " *$ #) $'"+ # +), 100 $-#) -#: %) +% # ( , ' # $-", -' 4.4 " +%) + $ +( -# ($ #, ,-1) ), # )' '-) +#1 '"3 +%) # -1" '" $ # +% $- + # +) $-#. . ( -, ( $ $- ' *$ # #) $'"+' & ( $-' $ ' -' $# -' ), $-02 +%#. 
' + +), +),, #1 ' 3# +%) $ - ' " $-')# +) $# '-02#, -( '"3 + . ( , N+ N+ ( "0 +%) $-'0 '0( # ' (), ') & 0 +' ' $' , ), '0) $ + . N+++ ( ' 2 ', $-02 , '# -) $, ( $-' 102 . # 0 '" '1) ( ( ), C+'") ' + -# 0 + - '0)#, ( ( & ' ( +( F *# '( $ 0 $'''") $ + . & -' ' ('. 3, 4). ' + #), ,, $# # -), *# -'" 3# -' , $-', '0(' " ' () $'"+'" (' $ +%#. -1'") ': – , ,, U-boat, jacket, war – ' ,, 4.3 voile ($), poulie ('), mat (#(), fig (& , $-'02 # $## -' $ )' ), vergue ( ) – &%+ ,. 3 $- %) # -# R( F-#) -' $ #'" $ # ' " $ z. # , $'() -' +' (), #"3 (" P, . . $'0 $$ $- «$ '#» (.. -' , %, $' R, . . '"3 ( ' $- , # N++'), $ -) % 1 . # #, 2 '. 4. #'") +( )-') $ #'") " $ ( + F-#) $'1 )# 3 &#. # #, ( $'() ( . 2). $+' 3 *$ #), +( $' 0 +'"# , $ #'" +( z '" + $ , [3, 5]. $- . ' , ' % 4. $- , # N++' ('. 2), # #'") +( F-#) - 0 $ N+ N+ z=0.1…0.15. $-'0( # -- 0.19 0.21 0.11 -$' '"), (N++'), ' U-boat 0.24 0.17 0.19 $- $ +-' 4.5, # #'" Romme 0.46 0.44 0.53 +( F-#) - $ "'ܟ3 , C## 0.21 0.22 0.18 +( , z ( . 3). ) 0.60 !' + ' %) $+), ( (), P R , -' +'# 1 $-+'# +( #) ' $', '' $ # (# ' +. 0,8 +) « +)#» ( 0,6 + $'"+# ' . 1 # $ '" '"3 , ), 0,4 -1 + $ # ,-' -$' '"), # ( -' +( '" )3. 0,2 6 +'") ' $ $ " 0 z $#2"0 - ## #) ,- 0,1 0,45 0,8 0,05 0,15 0,2 0,25 0,3 0,35 0,4 0,5 0,55 0,6 0,65 0,7 0,75 0,85 0,9 0,95 #1- +%# $ # ( «Romme» ( . 1). C . 3. R #" P, R F-#) $ z $'3)# '")# ' # $+ (' $-+-'). -, ( -' $-', , $ 1 # ,-, - «$')», $ #) +) #1- ' + # $ #)' &## , 0 $1 , #,. *# ( -' , $-'# $ F-#, # +( '": 0.46 $ # 2 '")# $ 101 ( $ 0 z = 0.15 $ 0.53 F -')# $-'# $ ( $ 0 0,6 z = 0.1. 0,4 # +#, 1, ( # $-$( '" $'"+ '" 2 '"),, $ (# + '0( 0,2 (), ($ $ N+). ' # 2 $)3 ( #% 0 0,7 0,1 0,2 0,3 0,4 0,5 0,6 0,8 0,9 0,95 0,05 0,15 0,25 0,35 0,45 0,55 0,65 0,75 0,85 - ( $ ''"), ''. z &%+# '"3 *&& - -' $-'. C . 4. ' + «$ '#» «$ '#» -, F-# $ $,- «$ 4.5 ' '#» $)3 ' (# - +. 
+ ' ) +, ( $-'0( " 2 $ '# # 3 , ' ( , 2 " " -' - 1 '(3 $)3 ( " (") #% , +'", $ #, ') ) #' $ (# * %'"0 #1 $'"+" # " ( -'" ', '" '" # [5], # ( #' <- " - # 2 ' ' & ' [21]. ' +) -1-). # #'"), +( F-#) '" # ,- # ( $-' ' % 5. )# -$ , $*# 3 , ' % 5. F-# $ ' + «$ '#» «$ *$ #, #) $'"+' # ( '#» ' & [33], -12 1680 ', $ «--- ) )# $-') ' 190 ). '# [29]. «C##» » ($) ') * ' & - $ '# 0.21 0.22 0.60 0 $ ## WordNet. '# 0.50 0.7 0.78 $-$''", ( ', $ -'12 -# ', $-0, -, ( , , «$ $ #, ' , , ,… '#» - +( '" '(3 +'"). «C##» ' , , , ( -' ), 1 $ «-- ». $# z=0.5 $) -0 C# # ,) $ #. +"## - - ( # , . . $+ % FN FP $-'1 + #1), +% $-0. «-- »: # +#, ( -, ( -' - 1 $ #'#), +'" $ # ( #% (), 1 % " , ! ,- +' ( &# $ " ' , $ -'")# '#, ( $+' +( '" *&& # - $'"+" $) + , $ 02 , - . ! . 6 $-'1 , # ( -0 5 ( ) 2" # , 2 #0 - ), ', $*# $ «$ '#» # - '- $% & $ # ,- '0. - ' # # ( #% '")# ' & # ' , , ()# # , +), $ - ' -1-), ' # , $ (# $ '0() $'''") , , , ! – ' . &#) #& +), ', , ( ' -' 1 , $& '"), $-. ( 2 '"), $ )( ' $ ' # ) TextTiling, ,- «$ '#» $'(# cos M = 0.71. $'"+02 ''"0 &#% 0 + #1- - # (# . '- . 4 $+ + #" F-#) ' ( #% , $ -' -, $ #% – $#, +# '"+2 , ' ( «$ '#» «$ '#» (.. $- $) #1- # , $) ". $ N++') $ # «C##». $-') $ #'") # % $# #% -' +' (), +). $ # +) $-1-, ( $-'0( 3 , ' ( , 2 $)3 ( #% . 102 * access. In SIGIR ’93: Proceedings of the 16th annual international ACM SIGIR conference on [1] Beeferman, Douglas, Adam Berger, and John Research and development in information Lafferty. Statistical models of text segmentation. retrieval, p. 59–68, New York, NY, USA. ACM Machine Learning, 34(1-3), February 1999. Press. [2] David M. Blei, Andrew Y. Ng, and Michael I. [13] Anna Kazantseva and Stan Szpakowicz. 2011. Jordan. 2003. Latent Dirichlet allocation. Journal Linear Text Segmentation Using Affinity of Machine Learning Research, 3, p. 993–1022. Propagation. In Proceedings of the 2011 [3] Anja Habacha Chaibi, Marwa Naili, Samia Conference on Empirical Methods in Natural Sammoud. 
Topic segmentation for textual Language Processing, p. 284–293, Edinburgh, document written in Arabic language. 18th Scotland. International Conference on Knowledge-Based [14] Anna Kazantseva, Stan Szpakowicz. Hierarchical and Intelligent Information & Engineering Topical Segmentation with Affinity Propagation. Systems - KES2014. Procedia Computer Science In Proceedings of COLING 2014, the 25th 35 (2014), p. 437–446 International Conference on Computational [4] Choi, F.Y.Y. 2000. Advances in domain Linguistics: Technical Papers, p. 37–47, Dublin, independent linear text segmentation. In Ireland. Proceedings of the 1st Meeting of the North [15] S. Lamprier, T. Amghar, and B. Levrat. 2008. American Chapter of the Association for On evaluation methodologies for text Computational Linguistics, p. 26–33. segmentation algorithms. 19th IEEE [5] G. Dias, E. Alves, J.G.P. Lopes. Topic International Conference on Tools with Artificial Segmentation Algorithms for Text Intelligence - Vol.2, Jan. Summarization and Passage Retrieval: An [16] David YW Lee. Genres, Registers, Text Types, Exhaustive Evaluation. AAAI'07 Proceedings of Domains, and Styles: Clarifying the Concepts the 22nd national conference on Artificial and Navigating a Path through the BNC Jungle. intelligence - Volume 2. p. 1334–1339 Language Learning & Technology. Vol. 5, No. 3, [6] Lan Du, Wray Buntine, and Mark Johnson. 2013. p. 37-72, September 2001. Topic Segmentation with a Structured Topic [17] Hemant Misra, François Yvon, Olivier Cappé, Model. In Proceedings of the 2013 Conference of and Joemon M. Jose. 2011. Text segmentation: A the North American Chapter of the Association topic modeling perspective. Information for Computational Linguistics: Human Language Processing and Management, 47(4), p. 528–544. Technologies, p. 190–200, Atlanta, Georgia. [18] POS Tagging Open Xerox. 
[7] Jacob Eisenstein. Hierarchical Text https://open.xerox.com/Services/fst-nlp- Segmentation from Multi-Scale Lexical tools/Consume/Part%20of%20Speech%20Taggi Cohesion. NAACL '09 Proceedings of Human ng%20%28Standard%29-178/ 2 Language Technologies: The 2009 Annual – 14.03.15 Conference of the North American Chapter of [19] Ponte, Jay and Bruce Croft. 1997. Text the Association for Computational Linguistics, p. segmentation by topic. In Proceedings of the 353–361 First European Conference on Research and [8] Jacob Eisenstein and Regina Barzilay. 2008. Advanced Technology for Digital Libraries. Bayesian Unsupervised Topic Segmentation. In [20] Jeffrey C. Reynar. 1994. An automatic method of Proceedings of the 2008 Conference on finding topic boundaries. In Proceedings of the Empirical Methods in Natural Language 32nd annual meeting on Association for Processing, p. 334–343, Honolulu, Hawaii. Computational Linguistics, p. 331–333, [9] Dominik Flejter, Karol Wieloch, Witold Morristown, NJ, USA. Association for Abramowicz. Unsupervised Methods of Topical Computational Linguistics. Text Segmentation for Polish. Balto-Slavonic [21] Martin Riedl, Chris Biemann. Text Segmentation Natural Language Processing 2007, June 29, with Topic Models. JLCL 2012 – Band 27 (1), 2007, p. 51–58. Prague, June 2007. Association p. 47–69. for Computational Linguistics. [22] M. Scaiano, D. Inkpen. Getting More from [10] M Georgescul, A Clark, and S Armstrong. An Segmentation Evaluation. 2012 Conference of analysis of quantitative aspects in the evaluation the North American Chapter of the Association of thematic segmentation algorithms. for Computational Linguistics: Human Language SigDIAL’06 Proceedings of the 7th SIGdial Technologies, p. 362–366, Montréal, Canada, Workshop on Discourse and Dialogue, Jan. June 3-8, 2012. 2009. [23] Stark, Heather A., 1988. What do paragraph [11] Marti A. Hearst. TextTiling: Segmenting text markings do? Discourse Processes 11, p. 
275– into multi-paragraph subtopic passages. 303. Computational Linguistics, 23(1), p. 33–64, [24] Xiaojun Wan. On the effectiveness of subwords March 1997. for lexical cohesion based story segmentation of [12] Marti A. Hearst and Christian Plaunt. 1993. Chinese broadcast news. Information Sciences Subtopic structuring for full-length document 177 (2007), p. 3718–3730. 103 [25] Na Ye, Jingbo Zhu, Huizhen Wang, Matthew Y. $ #$"0 ' « '- Ma, Bin Zhang. An Improved Model of 2012», , 30 # – 3 0 2012 . Dotplotting for Text Segmentation. Journal of http://www.dialog- Chinese Language and Computing 17 (1), p. 27- 21.ru/digest/2012/?type=doc 40. [26] ! .., !# .., . . [30] 0. ' ( ( . # $--1 $ $), .: !-# . 2010. 3 -' -# (- [31] #'" .!. $) + (# , ( &#% // - (( # ' ( " ' # +) M. Black “Metaphor”) // , . # . C +1 & '' . – 2013. )$. 4(24). – 2012. 1 (77). . 128-134 . 140–150. [27] !3 !.., !# .., . ., [32] .. & #. C +) '" ..-' #-' ( : '% .– .: ' , , 2004. +-$% % '" ), [33] .!. +. #$"0 # $-,- $'"+ # # + +). : +-- .-. - %$' // SWorld. 2013. , 2004 . 8. 1. . 81-93 [28] .. '"3 -. !# ( # +) Specifics of Applying Topic Segmentation #$"0 ' . .: 6. 2011. [29] .!. , .. . # - Algorithms to Scientific Texts ( ' + SemSin // K. Boyarsky, N. Gusarova, N. Dobrenko, E. Kanevsky, 1-- &% N Avdeeva This paper considers how to apply topic segmentation algorithms to real scientific texts. To study it we used monographs on the same subject written in three languages. The corpus includes several fragments both in the original and in professional translation. The research is based on the TextTiling algorithm that analyses how tightly adjoining parts of a text cohere. We examined how some parameters (the cutoff rate, the size of moving window and of the shift from one block to the next one) influence the segmentation quality. The optimum combinations of these parameters are defined for several languages. 
The experiments on Russian-language texts show that external lexical resources (stop-lists, classifiers, ontologies) notably improve the quality of segmentation. 104