Miloš Stanojević
Lecturer at University College London
2024
@inproceedings{hale-stanojevic-2024-llms, title = "Do {LLM}s learn a true syntactic universal?", author = "Hale, John T. and Stanojevi{\'c}, Milo{\v{s}}", editor = "Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung", booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing", month = nov, year = "2024", address = "Miami, Florida, USA", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.emnlp-main.950", pages = "17106--17119", abstract = "Do large multilingual language models learn language universals? We consider a candidate universal much-discussed in the linguistics literature, the Final-over-Final Condition (Sheehan et al., 2017b). This Condition is syntactic in the sense that it can only be stated by reference to abstract sentence properties such as nested phrases and head direction. A study of typologically diverse {``}mixed head direction{''} languages confirms that the Condition holds in corpora. But in a targeted syntactic evaluation, Gemini Pro only seems to respect the Condition in German, Russian, Hungarian and Serbian. These relatively high-resource languages contrast with Basque, where Gemini Pro does not seem to have learned the Condition at all. This result suggests that modern language models may need additional sources of bias in order to become truly human-like, within a developmentally-realistic budget of training data.", }
Do large multilingual language models learn language universals? We consider a candidate universal much-discussed in the linguistics literature, the Final-over-Final Condition (Sheehan et al., 2017b). This Condition is syntactic in the sense that it can only be stated by reference to abstract sentence properties such as nested phrases and head direction. A study of typologically diverse "mixed head direction" languages confirms that the Condition holds in corpora. But in a targeted syntactic evaluation, Gemini Pro only seems to respect the Condition in German, Russian, Hungarian and Serbian. These relatively high-resource languages contrast with Basque, where Gemini Pro does not seem to have learned the Condition at all. This result suggests that modern language models may need additional sources of bias in order to become truly human-like, within a developmentally-realistic budget of training data.
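The Condition itself is stated over very small tree configurations: on one common formulation, a head-final phrase may not immediately dominate a head-initial phrase. As a purely illustrative sketch (not the paper's evaluation setup, and ignoring the same-extended-projection proviso), a toy tree annotated with head direction can be checked mechanically:

```python
# Toy illustration of the Final-over-Final Condition (FOFC): a head-final phrase
# must not immediately dominate a head-initial phrase. Hypothetical node format,
# not the evaluation used in the paper.
from dataclasses import dataclass, field
from typing import List

@dataclass
class Phrase:
    label: str
    head_initial: bool                              # True: the head precedes its complement
    children: List["Phrase"] = field(default_factory=list)

def fofc_violations(node: Phrase) -> List[str]:
    """Collect parent/child pairs where a head-final parent dominates a head-initial child."""
    violations = []
    for child in node.children:
        if not node.head_initial and child.head_initial:
            violations.append(f"{node.label} (final) over {child.label} (initial)")
        violations.extend(fofc_violations(child))
    return violations

# [[V O] Aux]: head-initial VP under head-final AuxP -- the configuration FOFC rules out.
bad = Phrase("AuxP", head_initial=False, children=[Phrase("VP", head_initial=True)])
# [Aux [V O]]: initial over initial -- allowed.
good = Phrase("AuxP", head_initial=True, children=[Phrase("VP", head_initial=True)])

print(fofc_violations(bad))   # ['AuxP (final) over VP (initial)']
print(fofc_violations(good))  # []
```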
@inproceedings{franzluebbers-etal-2024-multipath, title = "Multipath parsing in the brain", author = "Franzluebbers, Berta and Dunagan, Donald and Stanojevi{\'c}, Milo{\v{s}} and Buys, Jan and Hale, John", editor = "Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek", booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", month = aug, year = "2024", address = "Bangkok, Thailand", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.acl-long.660", pages = "12215--12229", abstract = "Humans understand sentences word-by-word, in the order that they hear them. This incrementality entails resolving temporary ambiguities about syntactic relationships. We investigate how humans process these syntactic ambiguities by correlating predictions from incremental generative dependency parsers with timecourse data from people undergoing functional neuroimaging while listening to an audiobook. In particular, we compare competing hypotheses regarding the number of developing syntactic analyses in play during word-by-word comprehension: one vs more than one. This comparison involves evaluating syntactic surprisal from a state-of-the-art dependency parser with LLM-adapted encodings against an existing fMRI dataset. In both English and Chinese data, we find evidence for multipath parsing. Brain regions associated with this multipath effect include bilateral superior temporal gyrus.", }
Humans understand sentences word-by-word, in the order that they hear them. This incrementality entails resolving temporary ambiguities about syntactic relationships. We investigate how humans process these syntactic ambiguities by correlating predictions from incremental generative dependency parsers with timecourse data from people undergoing functional neuroimaging while listening to an audiobook. In particular, we compare competing hypotheses regarding the number of developing syntactic analyses in play during word-by-word comprehension: one vs more than one. This comparison involves evaluating syntactic surprisal from a state-of-the-art dependency parser with LLM-adapted encodings against an existing fMRI dataset. In both English and Chinese data, we find evidence for multipath parsing. Brain regions associated with this multipath effect include bilateral superior temporal gyrus.
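The kind of analysis involved can be sketched very roughly as follows (synthetic data and a bare-bones HRF/OLS pipeline; the actual study uses a full GLM with nuisance regressors): per-word surprisal from a parser is convolved into a scan-level regressor and fit against a voxel timecourse.

```python
# Simplified sketch of relating parser-derived surprisal to an fMRI timecourse.
# Synthetic data, single regressor, ordinary least squares -- not the paper's analysis.
import numpy as np

rng = np.random.default_rng(0)
TR, n_scans = 2.0, 300                         # repetition time (s), number of volumes
scan_times = np.arange(n_scans) * TR

def hrf(t):
    """Very rough single-gamma haemodynamic response function (peak around 5 s)."""
    t = np.clip(t, 0.0, None)
    return (t ** 5) * np.exp(-t) / 120.0

# Word onsets (s) and per-word surprisal from some incremental parser (synthetic here).
word_onsets = np.sort(rng.uniform(0, n_scans * TR, size=2000))
surprisal = rng.gamma(shape=2.0, scale=1.5, size=word_onsets.size)

# Scan-level regressor: each word contributes surprisal * HRF(time since onset).
regressor = np.zeros(n_scans)
for onset, s in zip(word_onsets, surprisal):
    regressor += s * hrf(scan_times - onset)

# Synthetic BOLD signal that partly reflects the regressor, plus noise.
bold = 0.8 * (regressor - regressor.mean()) / regressor.std() + rng.normal(0, 1, n_scans)

# Ordinary least squares fit and variance explained.
X = np.column_stack([np.ones(n_scans), regressor])
beta, *_ = np.linalg.lstsq(X, bold, rcond=None)
r2 = 1 - np.sum((bold - X @ beta) ** 2) / np.sum((bold - bold.mean()) ** 2)
print(f"beta = {beta[1]:.3f}, R^2 = {r2:.3f}")
```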
@inproceedings{wang-etal-2024-large-language-models, title = "How can large language models become more human?", author = "Wang, Daphne and Sadrzadeh, Mehrnoosh and Stanojevi{\'c}, Milo{\v{s}} and Chow, Wing-Yee and Breheny, Richard", editor = "Kuribayashi, Tatsuki and Rambelli, Giulia and Takmaz, Ece and Wicke, Philipp and Oseki, Yohei", booktitle = "Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics", month = aug, year = "2024", address = "Bangkok, Thailand", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.cmcl-1.14", pages = "166--176", abstract = "Psycholinguistic experiments reveal that efficiency of human language use is founded on predictions at both syntactic and lexical levels. Previous models of human prediction exploiting LLMs have used an information theoretic measure called \textit{surprisal}, with success on naturalistic text in a wide variety of languages, but under-performance on challenging text such as garden path sentences. This paper introduces a novel framework that combines the lexical predictions of an LLM with the syntactic structures provided by a dependency parser. The framework gives rise to an \textit{Incompatibility Fraction}. When tested on two garden path datasets, it correlated well with human reading times, distinguished between easy and hard garden path, and outperformed surprisal.", }
Psycholinguistic experiments reveal that efficiency of human language use is founded on predictions at both syntactic and lexical levels. Previous models of human prediction exploiting LLMs have used an information theoretic measure called surprisal, with success on naturalistic text in a wide variety of languages, but under-performance on challenging text such as garden path sentences. This paper introduces a novel framework that combines the lexical predictions of an LLM with the syntactic structures provided by a dependency parser. The framework gives rise to an Incompatibility Fraction. When tested on two garden path datasets, it correlated well with human reading times, distinguished between easy and hard garden path, and outperformed surprisal.
2023
@article{ccg:brain:cognitive:science, author = {Stanojevi\'{c}, Milo\v{s} and Brennan, Jonathan R. and Dunagan, Donald and Steedman, Mark and Hale, John T.}, title = "{Modeling Structure-Building in the Brain With CCG Parsing and Large Language Models}", journal = {Cognitive Science}, volume = {47}, number = {7}, pages = {e13312}, keywords = {Syntax, Parsing, Grammar, fMRI, Neural networks, Language modeling, Surprisal}, doi = {https://doi.org/10.1111/cogs.13312}, url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/cogs.13312}, eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1111/cogs.13312}, year = {2023} }
To model behavioral and neural correlates of language comprehension in naturalistic environments, researchers have turned to broad-coverage tools from natural-language processing and machine learning. Where syntactic structure is explicitly modeled, prior work has relied predominantly on context-free grammars (CFG), yet such formalisms are not sufficiently expressive for human languages. Combinatory Categorial Grammars (CCGs) are sufficiently expressive directly compositional models of grammar with flexible constituency that affords incremental interpretation. In this work we evaluate whether a more expressive CCG provides a better model than a CFG for human neural signals collected with fMRI while participants listen to an audiobook story. We further test between variants of CCG that differ in how they handle optional adjuncts. These evaluations are carried out against a baseline that includes estimates of next-word predictability from a Transformer neural network language model. Such a comparison reveals unique contributions of CCG structure-building predominantly in the left posterior temporal lobe: CCG-derived measures offer a superior fit to neural signals compared to those derived from a CFG. These effects are spatially distinct from bilateral superior temporal effects that are unique to predictability. Neural effects for structure-building are thus separable from predictability during naturalistic listening, and those effects are best characterized by a grammar whose expressive power is motivated on independent linguistic grounds.
@inproceedings{synjax2023, title = "{S}yn{J}ax: Structured Probability Distributions for {JAX}", author = "Stanojevi{\'c}, Milo{\v{s}} and Sartran, Laurent", editor = "Feng, Yansong and Lefever, Els", booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", month = dec, year = "2023", address = "Singapore", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2023.emnlp-demo.32", doi = "10.18653/v1/2023.emnlp-demo.32", pages = "353--364", }
The development of deep learning software libraries enabled significant progress in the field by allowing users to focus on modeling, while letting the library take care of the tedious and time-consuming task of optimizing execution for modern hardware accelerators. However, this has benefited only particular types of deep learning models, such as Transformers, whose primitives map easily to vectorized computation. Models that explicitly account for structured objects, such as trees and segmentations, did not benefit equally because they require custom algorithms that are difficult to implement in a vectorized form. SynJax directly addresses this problem by providing an efficient vectorized implementation of inference algorithms for structured distributions covering alignment, tagging, segmentation, constituency trees and spanning trees. This is done by exploiting the connection between algorithms for automatic differentiation and probabilistic inference. With SynJax we can build large-scale differentiable models that explicitly model structure in the data. The code is available at https://github.com/google-deepmind/synjax.
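The connection between automatic differentiation and probabilistic inference mentioned above is the identity that the gradient of the log-partition function with respect to the log-potentials equals the marginals of the corresponding parts. A minimal JAX sketch of that identity for a generic linear-chain model (this is not SynJax's own API):

```python
# Marginals via automatic differentiation: d log Z / d (log-potential) = edge marginal.
# Generic linear-chain example illustrating the identity SynJax builds on; not SynJax's API.
import jax
import jax.numpy as jnp
from jax.scipy.special import logsumexp

def log_partition(log_potentials):
    """log Z of a linear chain; log_potentials[t, i, j] scores tag i at t followed by tag j."""
    alpha = jnp.zeros(log_potentials.shape[1])          # log forward scores at position 0
    for t in range(log_potentials.shape[0]):            # forward algorithm in log space
        alpha = logsumexp(alpha[:, None] + log_potentials[t], axis=0)
    return logsumexp(alpha)

key = jax.random.PRNGKey(0)
log_potentials = jax.random.normal(key, (4, 3, 3))      # 5 positions, 3 tags, edge scores only

# The gradient of log Z with respect to the log-potentials is the matrix of edge marginals.
edge_marginals = jax.grad(log_partition)(log_potentials)
print(edge_marginals.sum(axis=(1, 2)))                  # each step's marginals sum to 1.0
```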
@article{dunagan2023, author = {Dunagan, Donald and Stanojevi\'{c}, Milo\v{s} and Coavoux, Maximin and Zhang, Shulin and Bhattasali, Shohini and Li, Jixing and Brennan, Jonathan and Hale, John}, title = "{Neural correlates of object-extracted relative clause processing across English and Chinese}", journal = {Neurobiology of Language}, pages = {1-43}, year = {2023}, month = {05}, abstract = "{Are the brain bases of language comprehension the same across all human languages, or do these bases vary in a way that corresponds to differences in linguistic typology? English and Mandarin Chinese attest such a typological difference in the domain of relative clauses. Using fMRI with English and Chinese participants, who listened to the same translation-equivalent story, we analyzed neuroimages time-aligned to object-extracted relative clauses in both languages. In a GLM analysis of these naturalistic data, comprehension was selectively associated with increased hemodynamic activity in left posterior temporal lobe, angular gyrus, inferior frontal gyrus, precuneus, and posterior cingulate cortex in both languages. This result suggests the processing of object-extracted relative clauses is subserved by a common collection of brain regions, regardless of typology. However, there were also regions that were activated uniquely in our Chinese participants albeit not to a significantly greater degree. These were in the temporal lobe. These Chinese-specific results could reflect structural ambiguity-resolution work that must be done in Chinese but not English ORCs.}", issn = {2641-4368}, doi = {10.1162/nol_a_00110}, url = {https://doi.org/10.1162/nol\_a\_00110}, eprint = {https://direct.mit.edu/nol/article-pdf/doi/10.1162/nol\_a\_00110/2112616/nol\_a\_00110.pdf}, }
Are the brain bases of language comprehension the same across all human languages, or do these bases vary in a way that corresponds to differences in linguistic typology? English and Mandarin Chinese attest such a typological difference in the domain of relative clauses. Using fMRI with English and Chinese participants, who listened to the same translation-equivalent story, we analyzed neuroimages time-aligned to object-extracted relative clauses in both languages. In a GLM analysis of these naturalistic data, comprehension was selectively associated with increased hemodynamic activity in left posterior temporal lobe, angular gyrus, inferior frontal gyrus, precuneus, and posterior cingulate cortex in both languages. This result suggests the processing of object-extracted relative clauses is subserved by a common collection of brain regions, regardless of typology. However, there were also regions that were activated uniquely in our Chinese participants albeit not to a significantly greater degree. These were in the temporal lobe. These Chinese-specific results could reflect structural ambiguity-resolution work that must be done in Chinese but not English ORCs.
2022
@inproceedings{stanojevic-2022-unbiased, title = "Unbiased and Efficient Sampling of Dependency Trees", author = "Stanojevi{\'c}, Milo{\v{s}}", booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", month = dec, year = "2022", address = "Abu Dhabi, United Arab Emirates", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2022.emnlp-main.110", pages = "1691--1706", abstract = "Most computational models of dependency syntax consist of distributions over spanning trees. However, the majority of dependency treebanks require that every valid dependency tree has a single edge coming out of the ROOT node, a constraint that is not part of the definition of spanning trees. For this reason all standard inference algorithms for spanning trees are sub-optimal for inference over dependency trees.Zmigrod et al (2021) proposed algorithms for sampling with and without replacement from the dependency tree distribution that incorporate the single-root constraint. In this paper we show that their fastest algorithm for sampling with replacement, Wilson-RC, is in fact producing biased samples and we provide two alternatives that are unbiased. Additionally, we propose two algorithms (one incremental, one parallel) that reduce the asymptotic runtime of algorithm for sampling k trees without replacement to O(kn{\textasciicircum}3). These algorithms are both asymptotically and practically more efficient.", }
Distributions over spanning trees are the most common way of modeling dependency syntax computationally. However, most treebanks require that every valid dependency tree has a single edge coming out of the ROOT node, a constraint that is not part of the definition of spanning trees. For this reason all standard inference algorithms for spanning trees are sub-optimal for modeling dependency trees. Zmigrod et al. (2021b) have recently proposed algorithms for sampling with and without replacement from the single-root dependency tree distribution. In this paper we show that their fastest algorithm for sampling with replacement, Wilson-RC, is in fact producing biased samples and we provide two alternatives that are unbiased. Additionally, we propose two algorithms (one incremental, one parallel) that reduce the asymptotic runtime of their algorithm for sampling k trees without replacement to O(kn^3). These algorithms are both asymptotically and practically more efficient.
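The single-root constraint itself can be imposed exactly in the partition function using the matrix-tree construction of Koo et al. (2007). The small numpy check below illustrates that constraint (it is not one of the paper's sampling algorithms) by comparing the determinant against brute-force enumeration of single-root trees:

```python
# Partition function of *single-root* dependency trees via the matrix-tree theorem
# (Koo et al., 2007), checked against brute-force enumeration. Illustrates the
# single-root constraint discussed above, not the paper's sampling algorithms.
import itertools
import numpy as np

rng = np.random.default_rng(1)
n = 4                                    # number of words
theta = rng.uniform(0.1, 1.0, (n, n))    # theta[h, m]: weight of edge head h -> modifier m
np.fill_diagonal(theta, 0.0)
root = rng.uniform(0.1, 1.0, n)          # root[m]: weight of edge ROOT -> m

# Matrix-tree construction: word-word Laplacian with its first row replaced by root scores.
L = np.diag(theta.sum(axis=0)) - theta
L_hat = L.copy()
L_hat[0, :] = root
Z_det = np.linalg.det(L_hat)

# Brute force: every head assignment with exactly one ROOT attachment that forms a tree.
def is_tree(heads):
    seen = set()
    for m in range(n):                   # follow head chains; every word must reach ROOT (-1)
        path = set()
        while m != -1 and m not in seen:
            if m in path:
                return False             # cycle
            path.add(m)
            m = heads[m]
        seen |= path
    return True

Z_brute = 0.0
for heads in itertools.product(*[[-1] + [h for h in range(n) if h != m] for m in range(n)]):
    if sum(h == -1 for h in heads) != 1 or not is_tree(heads):
        continue
    w = 1.0
    for m, h in enumerate(heads):
        w *= root[m] if h == -1 else theta[h, m]
    Z_brute += w

print(Z_det, Z_brute)   # the two totals agree up to floating point
```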
@article{sartran2022transformer, author = {Sartran, Laurent and Barrett, Samuel and Kuncoro, Adhiguna and Stanojevi\'{c}, Milo\v{s} and Blunsom, Phil and Dyer, Chris}, title = "{Transformer Grammars: Augmenting Transformer Language Models with Syntactic Inductive Biases at Scale}", journal = {Transactions of the Association for Computational Linguistics}, volume = {10}, pages = {1423-1439}, year = {2022}, month = {12}, issn = {2307-387X}, doi = {10.1162/tacl_a_00526}, url = {https://doi.org/10.1162/tacl\_a\_00526}, eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00526/2064617/tacl\_a\_00526.pdf}, }
Transformer language models that are trained on vast amounts of data have achieved remarkable success at various NLP benchmarks. Intriguingly, this success is achieved by models that lack an explicit modeling of hierarchical syntactic structures, which were hypothesized by decades of linguistic research to be necessary for good generalization. This naturally leaves a question: to what extent can we further improve the performance of Transformer language models, through an inductive bias that encourages the model to explain the data through the lens of recursive syntactic compositions? Although the benefits of modeling recursive syntax have been shown at the small data and model scales, it remains an open question whether—and to what extent—a similar design principle is still beneficial in the case of powerful Transformer language models that work well at scale. To answer these questions, we introduce Transformer Grammars—a novel class of Transformer language models that combine: (i) the expressive power, scalability, and strong performance of Transformers, and (ii) recursive syntactic compositions, which here are implemented through a special attention mask. We find that Transformer Grammars outperform various strong baselines on multiple syntax-sensitive language modeling evaluation metrics, in addition to sentence-level language modeling perplexity. Nevertheless, we find that the recursive syntactic composition bottleneck harms perplexity on document-level modeling, providing evidence that a different kind of memory mechanism—that works independently of syntactic structures—plays an important role in the processing of long-form text.
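A much-simplified sketch of the composition idea (not the paper's actual STACK/COMPOSE masking scheme): once a constituent is closed, its internal positions are hidden from later tokens, which can reach the subtree only through the single position that composed it.

```python
# Toy attention mask in the spirit of recursive syntactic composition: the closing token
# of a constituent attends to the material it composes, and afterwards later positions
# see only that closing token, not the constituent's contents. Simplified illustration,
# not Transformer Grammars' actual masking scheme.
import numpy as np

def composition_mask(tokens):
    """tokens: linearized tree actions, e.g. '(S', 'the', 'dog', ')NP', ... ."""
    n = len(tokens)
    mask = np.zeros((n, n), dtype=bool)            # mask[i, j]: may position i attend to j?
    visible = []                                   # positions still visible to future tokens
    for i, tok in enumerate(tokens):
        if tok.startswith(")"):
            mask[i, visible] = True                # the closer attends to what it composes
            while visible and not tokens[visible[-1]].startswith("("):
                visible.pop()                      # hide the constituent's contents
            if visible:
                visible.pop()                      # and its opening bracket
        visible.append(i)                          # the current token stays visible
        mask[i, visible] = True
    return mask

tokens = ["(S", "(NP", "the", "dog", ")NP", "(VP", "barks", ")VP", ")S"]
m = composition_mask(tokens)
# After ')NP', the words 'the' and 'dog' are hidden; 'barks' sees (S, )NP, (VP and itself.
print([tokens[j] for j in np.flatnonzero(m[6])])
```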
2021
@article{10.1162/coli_a_00394, author = {Stanojevi\'{c}, Milo\v{s} and Steedman, Mark}, title = "{Formal Basis of a Language Universal}", journal = {Computational Linguistics}, volume = {47}, number = {1}, pages = {9-42}, year = {2021}, month = {04}, issn = {0891-2017}, doi = {10.1162/coli_a_00394}, url = {https://doi.org/10.1162/coli\_a\_00394}, eprint = {https://direct.mit.edu/coli/article-pdf/47/1/9/1911502/coli\_a\_00394.pdf}, }
Steedman (2020) proposes as a formal universal of natural language grammar that grammatical permutations of the kind that have given rise to transformational rules are limited to a class known to mathematicians and computer scientists as the “separable” permutations. This class of permutations is exactly the class that can be expressed in combinatory categorial grammars (CCG). The excluded non-separable permutations do in fact seem to be absent in a number of studies of cross-linguistic variation in word-order in nominal and verbal constructions.
The number of permutations that are separable grows in the number n of lexical elements in the construction as the large Schröder number S_{n-1}. Since that number grows much more slowly than the n! number of all permutations, this generalization is also of considerable practical interest for computational applications such as parsing and machine translation.
The present paper examines the mathematical and computational origins of this restriction, and the reason it is exactly captured in CCG without the imposition of any further constraints.
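Separable permutations are exactly the permutations avoiding the patterns 2413 and 3142, and for n elements they number the large Schröder number S_{n-1} (1, 2, 6, 22, 90, 394, ...). A small brute-force check of both facts (illustrative only, unrelated to the paper's proofs):

```python
# Count separable permutations by brute force, using the fact that they are exactly the
# permutations avoiding the patterns 2413 and 3142, and compare the counts against the
# large Schroeder numbers 1, 2, 6, 22, 90, 394.
from itertools import combinations, permutations

FORBIDDEN = [(2, 4, 1, 3), (3, 1, 4, 2)]

def pattern_of(values):
    """Relative-order pattern of a sequence, e.g. (7, 9, 2, 5) -> (2, 4, 1, 3)."""
    ranks = sorted(values)
    return tuple(ranks.index(v) + 1 for v in values)

def is_separable(perm):
    return not any(pattern_of(sub) in FORBIDDEN for sub in combinations(perm, 4))

for n in range(1, 7):
    total = sum(is_separable(p) for p in permutations(range(1, n + 1)))
    print(n, total)   # 1, 2, 6, 22, 90, 394 -- the large Schroeder numbers S_{n-1}
```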
@inproceedings{stanojevic-cohen-2021-root, title = "A Root of a Problem: Optimizing Single-Root Dependency Parsing", author = "Stanojevi{\'c}, Milo{\v{s}} and Cohen, Shay B.", booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", month = nov, year = "2021", address = "Online and Punta Cana, Dominican Republic", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2021.emnlp-main.823", pages = "10540--10557", abstract = "We describe two approaches to single-root dependency parsing that yield significant speed ups in such parsing. One approach has been previously used in dependency parsers in practice, but remains undocumented in the parsing literature, and is considered a heuristic. We show that this approach actually finds the optimal dependency tree. The second approach relies on simple reweighting of the inference graph being input to the dependency parser and has an optimal running time. Here, we again show that this approach is fully correct and identifies the highest-scoring parse tree. Our experiments demonstrate a manyfold speed up compared to a previous graph-based state-of-the-art parser without any loss in accuracy or optimality.", }
We describe two approaches to single-root dependency parsing that yield significant speed ups in such parsing. One approach has been previously used in dependency parsers in practice, but remains undocumented in the parsing literature, and is considered a heuristic. We show that this approach actually finds the optimal dependency tree. The second approach relies on simple reweighting of the inference graph being input to the dependency parser and has an optimal running time. Here, we again show that this approach is fully correct and identifies the highest-scoring parse tree. Our experiments demonstrate a manyfold speed up compared to a previous graph-based state-of-the-art parser without any loss in accuracy or optimality.
@inproceedings{cmcl:2021:ccg:brain, title = "{Modeling Incremental Language Comprehension in the Brain with {C}ombinatory {C}ategorial {G}rammar}", author = "Stanojevi{\'c}, Milo{\v{s}} and Bhattasali, Shohini and Dunagan, Donald and Campanelli, Luca and Steedman, Mark and Brennan, Jonathan and Hale, John", booktitle = "Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics", month = jun, year = "2021", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2021.cmcl-1.3", doi = "10.18653/v1/2021.cmcl-1.3", pages = "23--38", abstract = "Hierarchical sentence structure plays a role in word-by-word human sentence comprehension, but it remains unclear how best to characterize this structure and unknown how exactly it would be recognized in a step-by-step process model. With a view towards sharpening this picture, we model the time course of hemodynamic activity within the brain during an extended episode of naturalistic language comprehension using Combinatory Categorial Grammar (CCG). CCG has well-defined incremental parsing algorithms, surface compositional semantics, and can explain long-range dependencies as well as complicated cases of coordination. We find that CCG-derived predictors improve a regression model of fMRI time course in six language-relevant brain regions, over and above predictors derived from context-free phrase structure. Adding a special Revealing operator to CCG parsing, one designed to handle right-adjunction, improves the fit in three of these regions. This evidence for CCG from neuroimaging bolsters the more general case for mildly context-sensitive grammars in the cognitive science of language.", }
Hierarchical sentence structure plays a role in word-by-word human sentence comprehension, but it remains unclear how best to characterize this structure and unknown how exactly it would be recognized in a step-by-step process model. With a view towards sharpening this picture, we model the time course of hemodynamic activity within the brain during an extended episode of naturalistic language comprehension using Combinatory Categorial Grammar (CCG). CCG has well-defined incremental parsing algorithms, surface compositional semantics, and can explain long-range dependencies as well as complicated cases of coordination. We find that CCG-derived predictors improve a regression model of fMRI time course in six language-relevant brain regions, over and above predictors derived from context-free phrase structure. Adding a special Revealing operator to CCG parsing, one designed to handle right-adjunction, improves the fit in three of these regions. This evidence for CCG from neuroimaging bolsters the more general case for mildly context-sensitive grammars in the cognitive science of language.
@inproceedings{stanojevic:steedman:iwcs:2021, title = "Computing All Quantifier Scopes with CCG", author={Stanojevi{\'{c}}, Milo{\v{s}} and Steedman, Mark}, booktitle = "Proceedings of the 14th International Conference on Computational Semantics - Short Papers", year = "2021", address = "Groningen, Netherlands", publisher = "Association for Computational Linguistics", }
We present a method for computing all quantifer scopes that can be extracted from a single CCG derivation. To do that we build on the proposal of Steedman (1999, 2011) where all existential quantifiers are treated as Skolem functions. We extend the approach by introducing a better packed representation of all possible specifications that also includes node addresses where the specifications happen. These addresses are necessary for recovering all, and only, possible readings.
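For instance (an illustrative example in the spirit of this Skolem-based treatment, not taken from the paper), the two readings of "Every farmer owns a donkey" differ only in whether the donkey's Skolem term is specified with the universally bound variable in its environment:

```latex
% Illustrative scope contrast via Skolem terms (hypothetical example, not from the paper):
% narrow scope -- the donkey's Skolem term depends on the farmer variable
\forall x.\; \mathit{farmer}(x) \Rightarrow \mathit{own}\bigl(x,\ \mathit{sk}_{\mathit{donkey}}(x)\bigr)
% wide scope -- the Skolem term is a constant, the same donkey for every farmer
\forall x.\; \mathit{farmer}(x) \Rightarrow \mathit{own}\bigl(x,\ \mathit{sk}_{\mathit{donkey}}\bigr)
```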
@inproceedings{bijl-de-vroe-etal-2021-modality, title = "Modality and Negation in Event Extraction", author = "Bijl de Vroe, Sander and Guillou, Liane and Stanojevi{\'c}, Milo{\v{s}} and McKenna, Nick and Steedman, Mark", booktitle = "Proceedings of the 4th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Text (CASE 2021)", month = aug, year = "2021", address = "Online", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2021.case-1.6", doi = "10.18653/v1/2021.case-1.6", pages = "31--42", abstract = "Language provides speakers with a rich system of modality for expressing thoughts about events, without being committed to their actual occurrence. Modality is commonly used in the political news domain, where both actual and possible courses of events are discussed. NLP systems struggle with these semantic phenomena, often incorrectly extracting events which did not happen, which can lead to issues in downstream applications. We present an open-domain, lexicon-based event extraction system that captures various types of modality. This information is valuable for Question Answering, Knowledge Graph construction and Fact-checking tasks, and our evaluation shows that the system is sufficiently strong to be used in downstream applications.", }
Language provides speakers with a rich system of modality for expressing thoughts about events, without being committed to their actual occurrence. Modality is commonly used in the political news domain, where both actual and possible courses of events are discussed. NLP systems struggle with these semantic phenomena, often incorrectly extracting events which did not happen, which can lead to issues in downstream applications. We present an open-domain, lexicon-based event extraction system that captures various types of modality. This information is valuable for Question Answering, Knowledge Graph construction and Fact-checking tasks, and our evaluation shows that the system is sufficiently strong to be used in downstream applications.
2020
@inproceedings{stanojevic-steedman-2020-max, title = "{Max-Margin Incremental {CCG} Parsing}", author = "Stanojevi{\'c}, Milo{\v{s}} and Steedman, Mark", booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.acl-main.378", doi = "10.18653/v1/2020.acl-main.378", pages = "4111--4122", abstract = "Incremental syntactic parsing has been an active research area both for cognitive scientists trying to model human sentence processing and for NLP researchers attempting to combine incremental parsing with language modelling for ASR and MT. Most effort has been directed at designing the right transition mechanism, but less has been done to answer the question of what a probabilistic model for those transition parsers should look like. A very incremental transition mechanism of a recently proposed CCG parser when trained in straightforward locally normalised discriminative fashion produces very bad results on English CCGbank. We identify three biases as the causes of this problem: label bias, exposure bias and imbalanced probabilities bias. While known techniques for tackling these biases improve results, they still do not make the parser state of the art. Instead, we tackle all of these three biases at the same time using an improved version of beam search optimisation that minimises all beam search violations instead of minimising only the biggest violation. The new incremental parser gives better results than all previously published incremental CCG parsers, and outperforms even some widely used non-incremental CCG parsers.", }
Incremental syntactic parsing has been an active research area both for cognitive scientists trying to model human sentence processing and for NLP researchers attempting to combine incremental parsing with language modelling for ASR and MT. Most effort has been directed at designing the right transition mechanism, but less has been done to answer the question of what a probabilistic model for those transition parsers should look like. A very incremental transition mechanism of a recently proposed CCG parser when trained in straightforward locally normalised discriminative fashion produces very bad results on English CCGbank. We identify three biases as the causes of this problem: label bias, exposure bias and imbalanced probabilities bias. While known techniques for tackling these biases improve results, they still do not make the parser state of the art. Instead, we tackle all of these three biases at the same time using an improved version of beam search optimisation that minimises all beam search violations instead of minimising only the biggest violation. The new incremental parser gives better results than all previously published incremental CCG parsers, and outperforms even some widely used non-incremental CCG parsers.
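The difference between the two training objectives can be sketched schematically (toy per-step scores, not the parser's real configuration scores): given, at each beam step, the score of the gold item and of the lowest-scoring item kept in the beam, the original beam search optimisation penalises only the largest margin violation, whereas the variant used here sums all of them.

```python
# Schematic contrast between "max violation" and "all violations" beam-search training losses.
import numpy as np

def violations(gold_scores, beam_kth_scores, margin=1.0):
    """Per-step hinge violations: how far the gold item falls below staying in the beam."""
    return np.maximum(0.0, margin + np.asarray(beam_kth_scores) - np.asarray(gold_scores))

gold_scores     = [2.0, 1.5, 0.2, 3.0, 0.5]   # score of the gold parser item at each step
beam_kth_scores = [1.0, 1.8, 1.1, 2.0, 1.2]   # score of the k-th best (last) item in the beam

v = violations(gold_scores, beam_kth_scores)
print("max-violation loss :", v.max())        # original beam search optimisation
print("all-violations loss:", v.sum())        # variant that penalises every violation
```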
@inproceedings{stanojevic-steedman-2020-span, title = "{Span-Based {LCFRS}-2 Parsing}", author = "Stanojevi{\'c}, Milo{\v{s}} and Steedman, Mark", booktitle = "Proceedings of the 16th International Conference on Parsing Technologies and the IWPT 2020 Shared Task on Parsing into Enhanced Universal Dependencies", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.iwpt-1.12", doi = "10.18653/v1/2020.iwpt-1.12", pages = "111--121", abstract = "The earliest models for discontinuous constituency parsers used mildly context-sensitive grammars, but the fashion has changed in recent years to grammar-less transition-based parsers that use strong neural probabilistic models to greedily predict transitions. We argue that grammar-based approaches still have something to contribute on top of what is offered by transition-based parsers. Concretely, by using a grammar formalism to restrict the space of possible trees we can use dynamic programming parsing algorithms for exact search for the most probable tree. Previous chart-based parsers for discontinuous formalisms used probabilistically weak generative models. We instead use a span-based discriminative neural model that preserves the dynamic programming properties of the chart parsers. Our parser does not use an explicit grammar, but it does use explicit grammar formalism constraints: we generate only trees that are within the LCFRS-2 formalism. These properties allow us to construct a new parsing algorithm that runs in lower worst-case time complexity of O(l n{\^{}}4 +n{\^{}}6), where $n$ is the sentence length and $l$ is the number of unique non-terminal labels. This parser is efficient in practice, provides best results among chart-based parsers, and is competitive with the best transition based parsers. We also show that the main bottleneck for further improvement in performance is in the restriction of fan-out to degree 2. We show that well-nestedness is helpful in speeding up parsing, but lowers accuracy.", }
The earliest models for discontinuous constituency parsers used mildly context-sensitive grammars, but the fashion has changed in recent years to grammar-less transition-based parsers that use strong neural probabilistic models to greedily predict transitions. We argue that grammar-based approaches still have something to contribute on top of what is offered by transition-based parsers. Concretely, by using a grammar formalism to restrict the space of possible trees we can use dynamic programming parsing algorithms for exact search for the most probable tree. Previous chart-based parsers for discontinuous formalisms used probabilistically weak generative models. We instead use a span-based discriminative neural model that preserves the dynamic programming properties of the chart parsers. Our parser does not use an explicit grammar, but it does use explicit grammar formalism constraints: we generate only trees that are within the LCFRS-2 formalism. These properties allow us to construct a new parsing algorithm that runs in lower worst-case time complexity of O(l n^4 + n^6), where n is the sentence length and l is the number of unique non-terminal labels. This parser is efficient in practice, provides best results among chart-based parsers, and is competitive with the best transition-based parsers. We also show that the main bottleneck for further improvement in performance is in the restriction of fan-out to degree 2. We show that well-nestedness is helpful in speeding up parsing, but lowers accuracy.
@inproceedings{CUNY2020, title="{Predictive Processing of Coordination in CCG}", author="Milo\v{s} Stanojevi\'{c} and John Hale and Mark Steedman", booktitle = "Proceedings of the 33rd Annual {CUNY} Conference on Human Sentence Processing", year = "2020", address = "Amherst, Massachusetts", organization = "University of Massachusetts", url = "https://osf.io/2xjgn" }
Human sentence processing is highly incremental at all levels, including semantic interpretations. Right adjuncts (including right conjuncts) are interesting in this context because processing models often make adjunction a non-incremental operation. Sturt and Lombardo (S&L, 2005) have shown that a greater degree of incrementality seems to be needed in processing coordinations. S&L propose using the adjoin operation of Tree-Adjoining Grammar (TAG) to explain this incrementality. Here we argue that the operations of tree rotation and revealing from Stanojević and Steedman (2019) provide a simpler explanation of the results of S&L within the Combinatory Categorial Grammar (CCG) formalism.
2019
@inproceedings{torr-etal-2019-wide, title = "{Wide-Coverage Neural {A}* Parsing for {M}inimalist {G}rammars}", author = "Torr, John and Stanojevi\'{c}, Milo\v{s} and Steedman, Mark and Cohen, Shay B.", booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2019", address = "Florence, Italy", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/P19-1238", doi = "10.18653/v1/P19-1238", pages = "2486--2505", abstract = "Minimalist Grammars (Stabler, 1997) are a computationally oriented, and rigorous formalisation of many aspects of Chomsky{'}s (1995) Minimalist Program. This paper presents the first ever application of this formalism to the task of realistic wide-coverage parsing. The parser uses a linguistically expressive yet highly constrained grammar, together with an adaptation of the A* search algorithm currently used in CCG parsing (Lewis and Steedman, 2014; Lewis et al., 2016), with supertag probabilities provided by a bi-LSTM neural network supertagger trained on MGbank, a corpus of MG derivation trees. We report on some promising initial experimental results for overall dependency recovery as well as on the recovery of certain unbounded long distance dependencies. Finally, although like other MG parsers, ours has a high order polynomial worst case time complexity, we show that in practice its expected time complexity is cubic in the length of the sentence. The parser is publicly available.", }
Minimalist Grammars (Stabler, 1997) are a computationally oriented, and rigorous formalisation of many aspects of Chomsky’s (1995) Minimalist Program. This paper presents the first ever application of this formalism to the task of realistic wide-coverage parsing. The parser uses a linguistically expressive yet highly constrained grammar, together with an adaptation of the A* search algorithm currently used in CCG parsing (Lewis and Steedman, 2014; Lewis et al., 2016), with supertag probabilities provided by a bi-LSTM neural network supertagger trained on MGbank, a corpus of MG derivation trees. We report on some promising initial experimental results for overall dependency recovery as well as on the recovery of certain unbounded long distance dependencies. Finally, although like other MG parsers, ours has a high order polynomial worst case time complexity, we show that in practice its expected time complexity is O(n^3). The parser is publicly available.
@inproceedings{FG2019:MG, author="Stanojevi{\'{c}}, Milo{\v{s}}", editor="Bernardi, Raffaella and Kobele, Greg and Pogodalla, Sylvain", title="{On the Computational Complexity of Head Movement and Affix Hopping}", booktitle="Formal Grammar", year="2019", publisher="Springer Berlin Heidelberg", address="Berlin, Heidelberg", pages="101--116", isbn="978-3-662-59648-7" }
Head movement is a syntactic operation used in most generative syntactic analyses. However, its computational properties have not been extensively studied. Stabler (2001) formalises head movement in the framework of Minimalist Grammars by extending the item representation to allow for easy extraction of the head. This work shows that Stabler’s representation is in fact suboptimal because it causes higher polynomial parsing complexity. A new algorithm is derived for parsing head movement and affix hopping by changing the kinds of representations that the parser deals with. This algorithm has a much better asymptotic worst-case runtime of O(n^{2k+5}). This result makes parsing head movement and affix hopping computationally as efficient as parsing a single phrase movement.
@inproceedings{stanojevic-steedman-2019-ccg, title = "{{CCG} Parsing Algorithm with Incremental Tree Rotation}", author = "Stanojevi{\'c}, Milo{\v{s}} and Steedman, Mark", booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)", month = jun, year = "2019", address = "Minneapolis, Minnesota", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/N19-1020", pages = "228--239", }
The main obstacle to incremental sentence processing arises from right-branching constituent structures, which are present in the majority of English sentences, as well as optional constituents that adjoin on the right, such as right adjuncts and right conjuncts. In CCG, many right-branching derivations can be replaced by semantically equivalent left-branching incremental derivations. The problem of right-adjunction is more resistant to solution, and has been tackled in the past using revealing-based approaches that often rely either on the higher-order unification over lambda terms (Pareschi and Steedman, 1987) or heuristics over dependency representations that do not cover the whole CCGbank (Ambati et al., 2015). We propose a new incremental parsing algorithm for CCG following the same revealing tradition of work but having a purely syntactic approach that does not depend on access to a distinct level of semantic representation. This algorithm can cover the whole CCGbank, with greater incrementality and accuracy than previous proposals.
@inproceedings{corner2019, title = "The Active-Filler Strategy in a Move-Eager Left-Corner Minimalist Grammar Parser", author = "Hunter, Tim and Stanojevi{\'c}, Milo{\v{s}} and Stabler, Edward", booktitle = "Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics", month = jun, year = "2019", address = "Minneapolis, Minnesota", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/W19-2901", pages = "1--10", }
Recent psycholinguistic evidence suggests that human parsing of moved elements is 'active', and perhaps even 'hyper-active': it seems that a leftward-moved object is related to a verbal position rapidly, perhaps even before the transitivity information associated with the verb is available to the listener. This paper presents a formal, sound and complete parser for Minimalist Grammars whose search space contains branching points that we can identify as the locus of the decision to perform this kind of active gap-finding. This brings formal models of parsing into closer contact with recent psycholinguistic theorizing than was previously possible.
2018
@InProceedings{stanojevic:stabler:2018:cogacll, author = "Stanojevi{\'{c}}, Milo{\v{s}} and Stabler, Edward", title = "{A Sound and Complete Left-Corner Parsing for Minimalist Grammars}", booktitle = "Proceedings of the Eight Workshop on Cognitive Aspects of Computational Language Learning and Processing", year = "2018", publisher = "Association for Computational Linguistics", pages = "65--74", location = "Melbourne", url = "http://aclweb.org/anthology/W18-2809" }
This paper presents a left-corner parser for minimalist grammars. The relation between the parser and the grammar is transparent in the sense that there is a very simple 1-1 correspondence between derivations and parses. Like left-corner context-free parsers, left-corner minimalist parsers can be non-terminating when the grammar has empty left corners, so an easily computed left-corner oracle is defined to restrict the search.
@inproceedings{real2018sick, title="{SICK-BR: a Portuguese corpus for inference}", author={Real, Livy and Rodrigues, Ana and e Silva, Andressa Vieira and Albiero, Beatriz and Thalenberg, Bruna and Guide, Bruno and Silva, Cindy and de Oliveira Lima, Guilherme and C{\^a}mara, Igor CS and Stanojevi{\'c}, Milo{\v{s}} and others}, booktitle="{International Conference on Computational Processing of the Portuguese Language}", pages={303--312}, year={2018}, organization={Springer} }
We describe SICK-BR, a Brazilian Portuguese corpus annotated with inference relations and semantic relatedness between pairs of sentences. SICK-BR is a translation and adaptation of the original SICK, a corpus of English sentences used in several semantic evaluations. SICK-BR consists of around 10k sentence pairs annotated for neutral/contradiction/entailment relations and for semantic relatedness, using a 5 point scale. Here we describe the strategies used for the adaptation of SICK, which preserve its original inference and relatedness relation labels in the SICK-BR Portuguese version. We also discuss some issues with the original corpus and how we might deal with them.
2017
@phdthesis{stanojevic:2017:thesis, author = {Milo\v{s} Stanojevi\'{c}}, title = "{Permutation Forests for Modeling Word Order in Machine Translation}", year = {2017}, school={University of Amsterdam}, }
In natural language, there is only a limited space for variation in the word order of linguistic productions. From a linguistic perspective, word order is the result of multiple application of syntactic recursive functions. These syntactic operations produce hierarchical syntactic structures, as well as a string of words that appear in a certain order. However, different languages are governed by different syntactic rules. Thus, one of the main problems in machine translation is to find the mapping between word order in the source language and word order in the target language. This is often done by a method of syntactic transfer, in which the syntactic tree is recovered from the source sentence, and then transduced so that its form is consistent with the syntactic rules of the target language.
In this dissertation, I propose an alternative to syntactic transfer that maintains its good properties---namely the compositional and hierarchical structure---but, unlike syntactic transfer, it is directly derived from data without requiring any linguistic annotation. This approach brings two main advantages. First, it allows for applying hierarchical reordering even on languages for which there are no syntactic parsers available. Second, unlike the trees used in syntactic transfer, which in some cases cannot cover the reordering patterns present in the data, the trees used in this work are built directly over the reordering patterns, so they can cover them by definition.
I treat reordering as a problem of predicting the permutation of the source words which permutes them into an order that is as close as possible to the target side order. This permutation can be recursively decomposed into a hierarchical structure called a permutation tree (PET) (Zhang and Gildea, 2007). In some cases there can be many permutation trees that can generate the same permutation. This set of permutation trees is called a permutation forest. A permutation forest is a richer representation of a permutation because it covers all possible segmentations consistent with the permutation, so modeling permutations over the whole forest is a more promising approach than modeling a single tree. I apply permutation trees in two sub-tasks of machine translation: word order prediction and word order evaluation. In the word order prediction scenario I propose a probabilistic model that treats both the non-terminals and the bracketing of the sentence as latent variables. In the context of MT evaluation, I propose evaluation metrics that incorporate PETs and use machine learning methods to approximate human judgment of translation quality. Overall, the permutation tree models proposed here are (i) compositional, (ii) hierarchical and (iii) directly derived from unannotated translation data. Empirically, the models satisfying these three properties have been shown to improve translation quality, and provide better correlation with human judgment when used for evaluation of machine translation output.
@inproceedings{stanojevic:alhama:2017:emnlp, author = {Stanojevi\'{c}, Milo\v{s} and G. Alhama, Raquel}, title = "{Neural Discontinuous Constituency Parsing}", booktitle = "{Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing, {EMNLP} 2017, Copenhagen, Denmark, September 9-11, 2017}", year = {2017}, }
One of the most pressing issues in discontinuous constituency transition-based parsing is that the relevant information for parsing decisions could be located in any part of the stack or the buffer. In this paper, we propose a solution to this problem by replacing the structured perceptron model with a recursive neural model that computes a global representation of the configuration, therefore allowing even the most remote parts of the configuration to influence the parsing decisions. We also provide a detailed analysis of how this representation should be built out of sub-representations of its core elements (words, trees and stack). Additionally, we investigate how different types of swap oracles influence the results. Our model is the first neural discontinuous constituency parser, and it outperforms all the previously published models on three out of four datasets while on the fourth it obtains second place by a tiny difference.
@InProceedings{stanojevic-simaan:2017:Short, author = {Stanojevi\'{c}, Milo\v{s} and Sima'an, Khalil}, title = "{Alternative Objective Functions for Training MT Evaluation Metrics}", booktitle = "{Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}", month = {July}, year = {2017}, address = {Vancouver, Canada}, publisher = {Association for Computational Linguistics}, pages = {20--25}, abstract = {MT evaluation metrics are tested for correlation with human judgments either at the sentence- or the corpus-level. Trained metrics ignore corpus-level judgments and are trained for high sentence-level correlation only. We show that training only for one objective (sentence or corpus level), can not only harm the performance on the other objective, but it can also be suboptimal for the objective being optimized. To this end we present a metric trained for corpus-level and show empirical comparison against a metric trained for sentence-level exemplifying how their performance may vary per language pair, type and level of judgment. Subsequently we propose a model trained to optimize both objectives simultaneously and show that it is far more stable than--and on average outperforms--both models on both objectives.}, url = {http://aclweb.org/anthology/P17-2004} }
MT evaluation metrics are tested for correlation with human judgments either at the sentence- or the corpus-level. Trained metrics ignore corpus-level judgments and are trained for high sentence-level correlation only. We show that training only for one objective (sentence or corpus level) can not only harm the performance on the other objective, but it can also be suboptimal for the objective being optimized. To this end we present a metric trained for corpus-level and show empirical comparison against a metric trained for sentence-level exemplifying how their performance may vary per language pair, type and level of judgment. Subsequently we propose a model trained to optimize both objectives simultaneously and show that it is far more stable than, and on average outperforms, both models on both objectives.
2016
@INBOOK{LACLStanojevic2016, pages = {273--290}, title = "{Minimalist Grammar Transition-Based Parsing}", publisher = {Springer Berlin Heidelberg}, year = {2016}, editor = {Amblard, Maxime and de Groote, Philippe and Pogodalla, Sylvain and Retor{\'e}, Christian}, author = {Stanojevi{\'{c}}, Milo{\v{s}}}, address = {Berlin, Heidelberg}, booktitle = "{Logical Aspects of Computational Linguistics. Celebrating 20 Years of LACL (1996--2016): 9th International Conference, LACL 2016, Nancy, France, December 5-7, 2016, Proceedings}", doi = {10.1007/978-3-662-53826-5_17}, isbn = {978-3-662-53826-5}, url = {http://dx.doi.org/10.1007/978-3-662-53826-5_17} }
Current chart-based parsers of Minimalist Grammars exhibit prohibitively high polynomial complexity that makes them unusable in practice. This paper presents a transition-based parser for Minimalist Grammars that approximately searches through the space of possible derivations by means of beam search, and does so very efficiently: the worst case complexity of building one derivation is O(n^2) and the best case complexity is O(n). This approximated inference can be guided by a trained probabilistic model that can condition on larger context than standard chart-based parsers. The transitions of the parser are very similar to the transitions of bottom-up shift-reduce parsers for Context-Free Grammars, with additional transitions for online reordering of words during parsing in order to make non-projective derivations projective.
@inproceedings{stanojevic-simaan:2016:COLING, author = {Stanojevi\'{c}, Milo\v{s} and Sima'an, Khalil}, title = "{Hierarchical Permutation Complexity for Word Order Evaluation}", booktitle = {Proceedings of the 26th International Conference on Computational Linguistics (COLING-2016)}, address = {Osaka, Japan}, year = {2016}, month = {December}, }
Existing approaches for evaluating word order in machine translation work with metrics computed directly over a permutation of word positions in system output relative to a reference translation. However, every permutation factorizes into a permutation tree (PET) built of primal permutations, i.e., atomic units that do not factorize any further. In this paper we explore the idea that permutations factorizing into (on average) shorter primal permutations should represent simpler ordering as well. Consequently, we contribute Permutation Complexity, a class of metrics over PETs and their extension to forests, and define tight metrics, a sub-class of metrics implementing this idea. Subsequently we define example tight metrics and empirically test them in word order evaluation. Experiments on the WMT13 data sets for ten language pairs show that a tight metric is more often than not better than the baselines.
@inproceedings{daiber-stanojevic-simaan:2016:COLING, author = {Daiber, Joachim and Stanojevi\'{c}, Milo\v{s} and Sima'an, Khalil}, title = "{Universal Reordering via Linguistic Typology}", booktitle = "{Proceedings of the 26th International Conference on Computational Linguistics (COLING-2016)}", address = {Osaka, Japan}, year = {2016}, month = {December}, }
In this paper we explore the novel idea of building a single universal reordering model from English to a large number of target languages. To build this model we exploit typological features of word order for a large number of target languages together with source (English) syntactic features and we train this model on a single combined parallel corpus representing all (22) involved language pairs. We contribute experimental evidence for the usefulness of linguistically defined typological features for building such a model. When the universal reordering model is used for preordering followed by monotone translation (no reordering inside the decoder), our experiments show that this pipeline gives comparable or improved translation performance with a phrase-based baseline for a large number of language pairs (12 out of 22) from diverse language families.
@InProceedings{daiber-EtAl:2016:WMT, author = {Daiber, Joachim and Stanojevi\'{c}, Milo\v{s} and Aziz, Wilker and Sima'an, Khalil}, title = "{Examining the Relationship between Preordering and Word Order Freedom in Machine Translation}", booktitle = "{Proceedings of the First Conference on Machine Translation}", month = {August}, year = {2016}, address = {Berlin, Germany}, publisher = {Association for Computational Linguistics}, pages = {118--130}, url = {http://www.aclweb.org/anthology/W/W16/W16-2213} }
We study the relationship between word order freedom and preordering in statistical machine translation. To assess word order freedom, we first introduce a novel entropy measure which quantifies how difficult it is to predict word order given a source sentence and its syntactic analysis. We then address preordering for two target languages at the far ends of the word order freedom spectrum, German and Japanese, and argue that for languages with more word order freedom, attempting to predict a unique word order given source clues only is less justified. Subsequently, we examine lattices of n-best word order predictions as a unified representation for languages from across this broad spectrum and present an effective solution to a resulting technical issue, namely how to select a suitable source word order from the lattice during training. Our experiments show that lattices are crucial for good empirical performance for languages with freer word order (English–German) and can provide additional improvements for fixed word order languages (English–Japanese).
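As a much simplified illustration of the kind of quantity involved: if a corpus attests several different target word orders for the same source configuration, the difficulty of predicting that order can be measured as the entropy of the empirical distribution over the observed permutations (the paper's measure is model-based and conditions on the source syntax; this toy uses hypothetical counts).

```python
# Simplified illustration of word order freedom as entropy over observed target orders
# for one source configuration. Toy counts, not the paper's model-based entropy measure.
import math
from collections import Counter

# Observed target orders for the same source construction (hypothetical counts).
observed = (["S V O"] * 70) + (["O V S"] * 20) + (["V S O"] * 10)

counts = Counter(observed)
total = sum(counts.values())
entropy = -sum((c / total) * math.log2(c / total) for c in counts.values())
print(f"entropy = {entropy:.2f} bits over {len(counts)} attested orders")
```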
@InProceedings{bojar-EtAl:2016:WMT2, author = {Bojar, Ond\v{r}ej and Graham, Yvette and Kamran, Amir and Stanojevi\'{c}, Milo\v{s}}, title = "{Results of the WMT16 Metrics Shared Task}", booktitle = "{Proceedings of the First Conference on Machine Translation}", month = {August}, year = {2016}, address = {Berlin, Germany}, publisher = {Association for Computational Linguistics}, pages = {199--231}, url = {http://www.aclweb.org/anthology/W/W16/W16-2302} }
This paper presents the results of the WMT16 Metrics Shared Task. We asked participants of this task to score the outputs of the MT systems involved in the WMT16 Shared Translation Task. We collected scores of 16 metrics from 9 research groups. In addition to that, we computed scores of 9 standard metrics (BLEU, SentBLEU, NIST, WER, PER, TER and CDER) as baselines. The collected scores were evaluated in terms of system-level correlation (how well each metric’s scores correlate with the WMT16 official manual ranking of systems) and in terms of segment-level correlation (how often a metric agrees with humans in comparing two translations of a particular sentence). This year there are several additions to the setup: a larger number of language pairs (18 in total), datasets from different domains (news, IT and medical), and different kinds of judgments: relative ranking (RR), direct assessment (DA) and HUME manual semantic judgments. Finally, the generation of a large number of hybrid systems was trialed to provide more conclusive system-level metric rankings.
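The two levels of evaluation can be sketched in a few lines; the official WMT computations, in particular the segment-level Kendall-tau-like statistic and its tie handling, are more involved, and the names below are illustrative:

```python
from scipy.stats import pearsonr

def system_level_correlation(metric_scores, human_scores):
    """System-level evaluation: Pearson r between one metric score per MT
    system and the corresponding human score."""
    return pearsonr(metric_scores, human_scores)[0]

def segment_level_agreement(pairs):
    """Segment-level evaluation: for each human judgement, `pairs` holds the
    metric score of the human-preferred translation and of the dispreferred
    one; the statistic is (concordant - discordant) / total, a simplified
    version of the Kendall-tau-like formulation used at WMT."""
    concordant = sum(1 for better, worse in pairs if better > worse)
    discordant = sum(1 for better, worse in pairs if better < worse)
    return (concordant - discordant) / len(pairs)

# toy example: four systems, three human-judged translation pairs
print(system_level_correlation([0.31, 0.28, 0.35, 0.22], [0.62, 0.55, 0.70, 0.41]))
print(segment_level_agreement([(0.8, 0.3), (0.4, 0.6), (0.9, 0.2)]))  # (2 - 1) / 3
```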
@InProceedings{jawaid-EtAl:2016:WMT, author = {Jawaid, Bushra and Kamran, Amir and Stanojevi\'{c}, Milo\v{s} and Bojar, Ond\v{r}ej}, title = "{Results of the WMT16 Tuning Shared Task}", booktitle = "{Proceedings of the First Conference on Machine Translation}", month = {August}, year = {2016}, address = {Berlin, Germany}, publisher = {Association for Computational Linguistics}, pages = {232--238}, url = {http://www.aclweb.org/anthology/W/W16/W16-2303} }
This paper presents the results of the WMT16 Tuning Shared Task. We provided the participants of this task with a complete machine translation system and asked them to tune its internal parameters (feature weights). The tuned systems were used to translate the test set and the outputs were manually ranked for translation quality. We received 4 submissions in the Czech-English and 8 in the English-Czech translation direction. In addition, we ran 2 baseline setups, tuning the parameters with standard optimizers for BLEU score. In contrast to previous years, the tuned systems in 2016 rely on large data.
2015
@InProceedings{stanojevic-simaan:2015:EMNLP, author = {Stanojevi\'{c}, Milo\v{s} and Sima'an, Khalil}, title = "{Reordering Grammar Induction}", booktitle = "{Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing}", month = {September}, year = {2015}, address = {Lisbon, Portugal}, publisher = {Association for Computational Linguistics}, pages = {44--54}, url = {http://aclweb.org/anthology/D15-1005} }
We present a novel approach for unsupervised induction of a Reordering Grammar using a modified form of permutation trees (Zhang and Gildea, 2007), which we apply to preordering in phrase-based machine translation. Unlike previous approaches, we induce in one step both the hierarchical structure and the transduction function over it from word-aligned parallel corpora. Furthermore, our model (1) handles non-ITG reordering patterns (up to 5-ary branching), (2) is learned from all derivations by treating not only labeling but also bracketing as a latent variable, (3) is entirely unlexicalized at the level of reordering rules, and (4) requires no linguistic annotation. Our model is evaluated both for accuracy in predicting target order, and for its impact on translation quality. We report significant performance gains over phrase reordering, and over two known preordering baselines for English-Japanese.
@InProceedings{stanojevic-simaan:2015:WMT, author = {Stanojevi\'{c}, Milo\v{s} and Sima'an, Khalil}, title = "{BEER 1.1: ILLC UvA submission to metrics and tuning task}", booktitle = "{Proceedings of the Tenth Workshop on Statistical Machine Translation}", month = {September}, year = {2015}, address = {Lisbon, Portugal}, publisher = {Association for Computational Linguistics}, pages = {396--401}, url = {http://aclweb.org/anthology/W15-3050} }
We describe the submissions of ILLC UvA to the metrics and tuning tasks at WMT15. Both submissions are based on the BEER evaluation metric originally presented at WMT14 (Stanojević and Sima’an, 2014a). The main changes introduced this year are: (i) extending the learning-to-rank trained sentence-level metric to the corpus level (but still decomposable to sentence level), (ii) incorporating syntactic ingredients based on dependency trees, and (iii) a technique for finding parameters of BEER that avoid “gaming of the metric” during tuning.
@InProceedings{stanojevic-EtAl:2015:WMT, author = {Stanojevi\'{c}, Milo\v{s} and Kamran, Amir and Koehn, Philipp and Bojar, Ond\v{r}ej}, title = "{Results of the WMT15 Metrics Shared Task}", booktitle = {Proceedings of the Tenth Workshop on Statistical Machine Translation}, month = {September}, year = {2015}, address = {Lisbon, Portugal}, publisher = {Association for Computational Linguistics}, pages = {256--273}, url = {http://aclweb.org/anthology/W15-3031} }
This paper presents the results of the WMT15 Metrics Shared Task. We asked participants of this task to score the outputs of the MT systems involved in the WMT15 Shared Translation Task. We collected scores of 46 metrics from 11 research groups. In addition to that, we computed scores of 7 standard metrics (BLEU, SentBLEU, NIST, WER, PER, TER and CDER) as baselines. The collected scores were evaluated in terms of system level correlation (how well each metric’s scores correlate with WMT15 official manual ranking of systems) and in terms of segment level correlation (how often a metric agrees with humans in comparing two translations of a particular sentence).
@InProceedings{stanojevic-kamran-bojar:2015:WMT, author = {Stanojevi\'{c}, Milo\v{s} and Kamran, Amir and Bojar, Ond\v{r}ej}, title = "{Results of the WMT15 Tuning Shared Task}", booktitle = {Proceedings of the Tenth Workshop on Statistical Machine Translation}, month = {September}, year = {2015}, address = {Lisbon, Portugal}, publisher = {Association for Computational Linguistics}, pages = {274--281}, url = {http://aclweb.org/anthology/W15-3032} }
This paper presents the results of the WMT15 Tuning Shared Task. We provided the participants of this task with a complete machine translation system and asked them to tune its internal parameters (feature weights). The tuned systems were used to translate the test set and the outputs were manually ranked for translation quality. We received 4 submissions in the English-Czech and 6 in the Czech-English translation direction. In addition, we ran 3 baseline setups, tuning the parameters with standard optimizers for BLEU score.
@article{stanojevic-simaan:2015:PBML, title="{Evaluating MT systems with BEER}", author={Stanojevi\'{c}, Milo\v{s} and Sima'an, Khalil}, journal={The Prague Bulletin of Mathematical Linguistics}, volume={104}, pages={17--26}, year={2015} }
We present BEER, an open source implementation of a machine translation evaluation metric. BEER is a metric trained for high correlation with human ranking by using learning-to-rank training methods. For evaluation of lexical accuracy it uses sub-word units (character n-grams), while for measuring word order it uses hierarchical representations based on PETs (permutation trees). During the last WMT metrics tasks, BEER has shown high correlation with human judgments both on the sentence and the corpus levels. In this paper we will show how BEER can be used for (i) full evaluation of MT output, (ii) isolated evaluation of word order and (iii) tuning MT systems.
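As a rough illustration of the sub-word idea, a character n-gram F-score can be computed as below; BEER itself learns weights over many such features with learning-to-rank rather than reporting a plain F1, and the names here are illustrative:

```python
from collections import Counter

def char_ngrams(text, n):
    """Multiset of character n-grams of a string."""
    return Counter(text[i:i + n] for i in range(len(text) - n + 1))

def char_ngram_f1(hypothesis, reference, n=4):
    """F1 over character n-grams: a simplified stand-in for the kind of
    sub-word lexical features BEER combines in its trained linear model."""
    hyp, ref = char_ngrams(hypothesis, n), char_ngrams(reference, n)
    overlap = sum((hyp & ref).values())
    if overlap == 0:
        return 0.0
    precision = overlap / sum(hyp.values())
    recall = overlap / sum(ref.values())
    return 2 * precision * recall / (precision + recall)

print(char_ngram_f1("the cat sat on the mat", "a cat sat on the mat"))
```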
2014
@InProceedings{stanojevic-simaan:2014:EMNLP2014, author = {Stanojevi\'{c}, Milo\v{s} and Sima'an, Khalil}, title = "{Fitting Sentence Level Translation Evaluation with Many Dense Features}", booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)}, month = {October}, year = {2014}, address = {Doha, Qatar}, publisher = {Association for Computational Linguistics}, pages = {202--206}, url = {http://www.aclweb.org/anthology/D14-1025} }
Sentence level evaluation in MT has turned out far more difficult than corpus level evaluation. Existing sentence level metrics employ a limited set of features, most of which are rather sparse at the sentence level, and their intricate models are rarely trained for ranking. This paper presents a simple linear model exploiting 33 relatively dense features, some of which are novel while others are known but seldom used, and trains it under the learning-to-rank framework. We evaluate our metric on the standard WMT12 data showing that it outperforms the strong baseline METEOR. We also analyze the contribution of individual features and the choice of training data, language-pair vs. target-language data, providing new insights into this task.
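A minimal sketch of this kind of learning-to-rank setup, assuming the standard pairwise reduction to binary classification over feature differences (the toy features, data, and use of scikit-learn are illustrative, not the paper's exact training procedure):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def pairwise_examples(better_feats, worse_feats):
    """Pairwise learning-to-rank reduction: each human judgement 'A is a better
    translation than B' yields two binary classification examples over the
    difference of their feature vectors."""
    X, y = [], []
    for a, b in zip(better_feats, worse_feats):
        X.append(a - b); y.append(1)   # better minus worse -> positive
        X.append(b - a); y.append(0)   # worse minus better -> negative
    return np.array(X), np.array(y)

# toy dense features per sentence (e.g. n-gram precision, length ratio,
# reordering score); the paper's metric uses 33 such features
better = np.array([[0.6, 0.9, 0.7], [0.5, 0.8, 0.6]])
worse  = np.array([[0.4, 0.7, 0.5], [0.3, 0.9, 0.4]])
X, y = pairwise_examples(better, worse)
ranker = LogisticRegression().fit(X, y)
# a higher decision value = predicted better translation
print(ranker.decision_function(np.array([[0.55, 0.85, 0.65]])))
```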
@InProceedings{stanojevic-simaan:2014:SSST-8, author = {Stanojevi\'{c}, Milo\v{s} and Sima'an, Khalil}, title = "{Evaluating Word Order Recursively over Permutation-Forests}", booktitle = {Proceedings of SSST-8, Eighth Workshop on Syntax, Semantics and Structure in Statistical Translation}, month = {October}, year = {2014}, address = {Doha, Qatar}, publisher = {Association for Computational Linguistics}, pages = {138--147}, url = {http://www.aclweb.org/anthology/W14-4017} }
Automatically evaluating word order of MT system output at the sentence level is challenging. At the sentence level, n-gram counts are rather sparse, which makes it difficult to measure word order quality effectively using lexicalized units. Recent approaches abstract away from lexicalization by assigning a score to the permutation representing how word positions in system output move around relative to a reference translation. Metrics over permutations exist (e.g., Kendall tau or Spearman rho) and have been shown to be useful in earlier work. However, none of the existing metrics over permutations groups word positions recursively into larger phrase-like blocks, which makes it difficult to account for long-distance reordering phenomena. In this paper we explore novel metrics computed over Permutation Forests (PEFs), packed charts of Permutation Trees (PETs), which are tree decompositions of a permutation into primitive ordering units. We empirically compare the PEFs metric against five known reordering metrics on WMT13 data for ten language pairs. The PEFs metric shows better correlation with human ranking than the other metrics on almost all language pairs. None of the other metrics exhibits as stable behavior across language pairs.
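For contrast with the hierarchical PEF metric, here is the kind of flat permutation metric the abstract mentions: a Kendall-tau-style score against the monotone reference order. This is a simplified illustration, not necessarily the exact formulation used as a baseline in the paper.

```python
def kendall_tau_score(permutation):
    """Kendall-tau-style score of a system-output permutation against the
    monotone reference order: the fraction of position pairs kept in the
    reference relative order (1.0 = identical order, 0.0 = fully inverted).
    A flat baseline of this kind ignores hierarchical, phrase-like grouping."""
    n = len(permutation)
    concordant = sum(1 for i in range(n) for j in range(i + 1, n)
                     if permutation[i] < permutation[j])
    return concordant / (n * (n - 1) // 2)

print(kendall_tau_score([0, 1, 2, 3]))  # 1.0
print(kendall_tau_score([3, 2, 1, 0]))  # 0.0
print(kendall_tau_score([1, 0, 3, 2]))  # 0.666..., only the two local swaps count against it
```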
@InProceedings{stanojevic-simaan:2014:WMT, author = {Stanojevi\'{c}, Milo\v{s} and Sima'an, Khalil}, title = "{BEER: BEtter Evaluation as Ranking}", booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation}, month = {June}, year = {2014}, address = {Baltimore, Maryland, USA}, publisher = {Association for Computational Linguistics}, pages = {414--419}, url = {http://www.aclweb.org/anthology/W/W14/W14-3354} }
We present the UvA-ILLC submission of the BEER metric to the WMT14 metrics task. BEER is a sentence-level metric that can incorporate a large number of features combined in a linear model. Novel contributions are (1) efficient tuning of a large number of features for maximizing correlation with human system ranking, and (2) novel features that give smoother sentence-level scores.
2012
@INPROCEEDINGS{tamchyna2012selecting, author = {Tamchyna, Ale{\v{s}} and Galu{\v{s}}{\v{c}}{\'a}kov{\'a}, Petra and Kamran, Amir and Stanojevi{\'c}, Milo{\v{s}} and Bojar, Ond{\v{r}}ej}, title = "{Selecting data for English-to-Czech machine translation}", booktitle = {Proceedings of the Seventh Workshop on Statistical Machine Translation}, year = {2012}, pages = {374--381}, organization = {Association for Computational Linguistics} }
We provide a few insights on data selection for machine translation. We evaluate the quality of the new CzEng 1.0, a parallel data source used in WMT12. We describe a simple technique for reducing the out-of-vocabulary rate after phrase extraction. We discuss the benefits of tuning towards multiple reference translations for the English-Czech language pair. We introduce a novel approach to data selection by full-text indexing and search: we select sentences similar to the test set from a large monolingual corpus and explore several options for incorporating them in a machine translation system. We show that this method can improve translation quality. Finally, we describe our submitted system CU-TAMCH-BOJ.
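The selection step can be approximated with a small similarity search; the paper uses a real full-text index over a large corpus, so the TF-IDF/cosine setup below is only an in-memory stand-in with illustrative names:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def select_similar(test_sentences, monolingual_corpus, k=2):
    """Return the k corpus sentences most similar to the test set.  A small
    in-memory approximation of selection via full-text indexing and search."""
    vectorizer = TfidfVectorizer()
    corpus_vecs = vectorizer.fit_transform(monolingual_corpus)
    test_vecs = vectorizer.transform(test_sentences)
    # best similarity of each corpus sentence to any test sentence
    sims = cosine_similarity(corpus_vecs, test_vecs).max(axis=1)
    return [monolingual_corpus[i] for i in np.argsort(-sims)[:k]]

corpus = ["the parliament met on tuesday",
          "stock markets fell sharply today",
          "the new translation system was evaluated"]
test = ["machine translation systems were evaluated at the workshop"]
print(select_similar(test, corpus, k=1))
```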