// minimalist_grammar_parser/lib.rs

//!This crate defines a number of structs and methods to parse and generate Minimalist Grammars
//!(MGs) from Stabler (1997). Specifically, it implements a variant of the MG parsing algorithm
//!adapted from Stabler (2011) and Stabler (2013)
//!
//!
//!# Examples of usage
//!The following code generates 4 sentences from the $a^nb^n$ language.
//!```
//!use minimalist_grammar_parser::Lexicon;
//!use minimalist_grammar_parser::ParsingConfig;
//!# use minimalist_grammar_parser::lexicon::LexiconParsingError;
//!let lexicon = Lexicon::from_string("a::s= b= s\nb::b\n::s")?;
//!let v = lexicon
//!    .generate("s", &ParsingConfig::default())?
//!    .take(4)
//!    .map(|(_prob, s, _rules)| {
//!        s.into_iter()
//!            .map(|word| word.try_inner().unwrap())
//!            .collect::<Vec<_>>()
//!            .join("")
//!    })
//!    .collect::<Vec<_>>();
//!assert_eq!(v, vec!["", "ab", "aabb", "aaabbb"]);
//!# Ok::<(), anyhow::Error>(())
//!```
//!## References
//!
//! - Stabler, E. (1997). Derivational minimalism. In C. Retoré (Ed.), Logical Aspects of Computational Linguistics (pp. 68–95). Springer. <https://doi.org/10.1007/BFb0052152>
//! - Stabler, E. (2011). Top-Down Recognizers for MCFGs and MGs. In F. Keller & D. Reitter (Eds.), Proceedings of the 2nd Workshop on Cognitive Modeling and Computational Linguistics (pp. 39–48). Association for Computational Linguistics. <https://aclanthology.org/W11-0605>
//! - Stabler, E. (2013). Two Models of Minimalist, Incremental Syntactic Analysis. Topics in Cognitive Science, 5(3), 611–633. <https://doi.org/10.1111/tops.12031>
//!
//!
#![warn(missing_docs)]

use std::borrow::Borrow;
use std::fmt::{Debug, Display};
use std::hash::Hash;
use std::marker::PhantomData;

#[cfg(not(target_arch = "wasm32"))]
use std::time::{Duration, Instant};

pub use lexicon::{Lexicon, ParsingError};

use logprob::LogProb;
use min_max_heap::MinMaxHeap;
use parsing::RuleHolder;

pub use parsing::RulePool;
use parsing::beam::{FuzzyScan, GeneratorScan, ParseScan, Scanner};
use parsing::{BeamWrapper, PartialRulePool, expand};
use petgraph::graph::NodeIndex;

use serde::{Deserialize, Serialize};
use thiserror::Error;

57///The basic input type of the library used by generation and parsing a like
58#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
59pub enum PhonContent<T> {
60    ///Normal words
61    Normal(T),
62    ///Words that are built out of combining heads with head movement
63    Affixed(Vec<T>),
64}
65
66#[derive(Debug, Copy, Clone, PartialEq, Eq, Error)]
67///Error caused by flattening a sentence that has affixes in it.
68pub struct FlattenError {}
69impl Display for FlattenError {
70    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71        write!(f, "This Input is not a Normal variant")
72    }
73}
74
75impl<T> PhonContent<T> {
76    ///Try to flatten the output assuming that it is [`PhonContent::Normal`]
77    pub fn try_inner(self) -> Result<T, FlattenError> {
78        match self {
79            PhonContent::Normal(x) => Ok(x),
80            PhonContent::Affixed(_) => Err(FlattenError {}),
81        }
82    }
83
84    ///Create new input assuming there are no affixes.
85    pub fn new(x: Vec<T>) -> Vec<PhonContent<T>> {
86        x.into_iter().map(PhonContent::Normal).collect()
87    }
88
89    ///Create new input assuming there are no affixes
90    pub fn from<const N: usize>(x: [T; N]) -> [PhonContent<T>; N] {
91        x.map(PhonContent::Normal)
92    }
93
94    ///Try to flatten the output assuming that all members of the vector are
95    ///[`PhonContent::Normal`]
96    pub fn try_flatten(x: Vec<PhonContent<T>>) -> Result<Vec<T>, FlattenError> {
97        x.into_iter().map(|x| x.try_inner()).collect()
98    }
99}
100impl PhonContent<&str> {
101    ///Try to flatten the output and join all affixes without spaces
102    pub fn flatten(x: Vec<PhonContent<&str>>) -> Vec<String> {
103        let mut v = vec![];
104        for content in x.into_iter() {
105            match content {
106                PhonContent::Normal(val) => v.push(val.to_string()),
107                PhonContent::Affixed(items) => v.push(items.join("")),
108            }
109        }
110        v
111    }
112}
113
114///Enum to record the direction of a merge/move operation (whether the phonological value goes to
115///the right or left)
116#[allow(missing_docs)]
117#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
118pub enum Direction {
119    #[default]
120    Left,
121    Right,
122}
123
124impl Direction {
125    ///Swaps direction so that left is right and vice-versa
126    pub fn flip(&self) -> Self {
127        match self {
128            Direction::Left => Direction::Right,
129            Direction::Right => Direction::Left,
130        }
131    }
132}
133
134impl From<Direction> for bool {
135    fn from(value: Direction) -> Self {
136        match value {
137            Direction::Left => false,
138            Direction::Right => true,
139        }
140    }
141}
142
143impl From<bool> for Direction {
144    fn from(value: bool) -> Self {
145        match value {
146            false => Direction::Left,
147            true => Direction::Right,
148        }
149    }
150}
151
152///This struct defines the configuration used when parsing a [`Lexicon`].
153///
154///It has the following options:
155/// - `min_log_prob`: The lowest probability string that the parser will consider. (Default = -256)
156/// - `move_prob`: The probability of moving rather than merging when both are available (Default = log(0.5))
157/// - `max_steps`: The maximum number of derivational steps before crashing (Default = 256)
158/// - `max_beams`: The maximum number of competing parses available in the parse heap at a single time (Default = 256)
159/// - `max_time`: The maximum amount of *time* before the parse crashes (not available on `wasm32). Disabled by default
160#[derive(Debug, Copy, Clone, PartialEq, Eq)]
161pub struct ParsingConfig {
162    min_log_prob: Option<LogProb<f64>>,
163    move_prob: LogProb<f64>,
164    dont_move_prob: LogProb<f64>,
165    max_steps: Option<usize>,
166    max_beams: Option<usize>,
167    max_consecutive_empty: Option<usize>,
168
169    #[cfg(not(target_arch = "wasm32"))]
170    max_time: Option<Duration>,
171}
172
173impl ParsingConfig {
174    ///Create a new [`ParsingConfig`] with no limits on parsing and default move probability. Be careful to ensure when parsing
175    ///or generating with this config to avoid infinite loops (at the very least use
176    ///[`ParsingConfig::with_max_time`]).
177    pub fn empty() -> ParsingConfig {
178        let move_prob = LogProb::from_raw_prob(0.5).unwrap();
179        let dont_move_prob = move_prob.opposite_prob();
180
181        ParsingConfig {
182            min_log_prob: None,
183            move_prob,
184            dont_move_prob,
185            max_consecutive_empty: None,
186            max_steps: None,
187            max_beams: None,
188
189            #[cfg(not(target_arch = "wasm32"))]
190            max_time: None,
191        }
192    }
193
194    ///Create a new [`ParsingConfig`] with the following parameters
195    pub fn new(
196        min_log_prob: LogProb<f64>,
197        move_prob: LogProb<f64>,
198        max_steps: usize,
199        max_beams: usize,
200    ) -> ParsingConfig {
201        let max_steps = usize::min(parsing::MAX_STEPS, max_steps);
202        let merge_prob = move_prob.opposite_prob();
203        ParsingConfig {
204            min_log_prob: Some(min_log_prob),
205            move_prob,
206            dont_move_prob: merge_prob,
207            max_consecutive_empty: None,
208            max_steps: Some(max_steps),
209            max_beams: Some(max_beams),
210            #[cfg(not(target_arch = "wasm32"))]
211            max_time: None,
212        }
213    }
214
215    ///Set the maximum time before timing out a parse (not available on `wasm32`).
216    #[cfg(not(target_arch = "wasm32"))]
217    pub fn with_max_time(mut self, duration: Duration) -> Self {
218        self.max_time = Some(duration);
219        self
220    }
221
222    ///Set the maximum number of repeated empty heads.
223    pub fn with_max_consecutive_empty(mut self, n: usize) -> Self {
224        self.max_consecutive_empty = Some(n);
225        self
226    }
227
228    ///Set the minimum log probability for a parse.
229    pub fn with_min_log_prob(mut self, min_log_prob: LogProb<f64>) -> Self {
230        self.min_log_prob = Some(min_log_prob);
231        self
232    }
233
234    ///Set the maximum number of derivational steps for a parse.
235    pub fn with_max_steps(mut self, max_steps: usize) -> Self {
236        self.max_steps = Some(max_steps);
237        self
238    }
239
240    ///Set the maximum number of competing parses at a single time.
241    pub fn with_max_beams(mut self, max_beams: usize) -> Self {
242        self.max_beams = Some(max_beams);
243        self
244    }
245
246    ///Set the probability of moving as opposed to merging.
247    pub fn with_move_prob(mut self, move_prob: LogProb<f64>) -> Self {
248        self.move_prob = move_prob;
249        self.dont_move_prob = self.move_prob.opposite_prob();
250        self
251    }
252}
253
254impl Default for ParsingConfig {
255    fn default() -> Self {
256        ParsingConfig::new(
257            LogProb::new(-256.0).unwrap(),
258            LogProb::from_raw_prob(0.5).unwrap(),
259            128,
260            256,
261        )
262    }
263}
264
265#[derive(Debug, Clone)]
266struct ParseHeap<T, B: Scanner<T>> {
267    parse_heap: MinMaxHeap<BeamWrapper<T, B>>,
268    phantom: PhantomData<T>,
269    config: ParsingConfig,
270    rule_arena: Vec<RuleHolder>,
271}
272
273impl<T: Eq + std::fmt::Debug + Clone, B: Scanner<T> + Eq + Clone> ParseHeap<T, B> {
274    fn pop(&mut self) -> Option<BeamWrapper<T, B>> {
275        self.parse_heap.pop_max()
276    }
277
278    fn can_push(&self, v: &BeamWrapper<T, B>) -> bool {
279        let is_probable_enough = self
280            .config
281            .min_log_prob
282            .map(|p| v.log_prob() > p)
283            .unwrap_or(true);
284        let is_short_enough = self
285            .config
286            .max_steps
287            .map(|max_steps| v.n_steps() < max_steps)
288            .unwrap_or(true);
289        let is_not_fake_structure = self
290            .config
291            .max_consecutive_empty
292            .map(|n_empty| v.n_consecutive_empty() <= n_empty)
293            .unwrap_or(true);
294        is_short_enough && is_probable_enough && is_not_fake_structure
295    }
296
297    fn push(&mut self, v: BeamWrapper<T, B>) {
298        if self.can_push(&v) {
299            if let Some(max_beams) = self.config.max_beams
300                && self.parse_heap.len() > max_beams
301            {
302                self.parse_heap.push_pop_min(v);
303            } else {
304                self.parse_heap.push(v);
305            }
306        }
307    }
308
309    fn new(start: BeamWrapper<T, B>, config: &ParsingConfig, cat: NodeIndex) -> Self {
310        let mut parse_heap = MinMaxHeap::with_capacity(config.max_beams.unwrap_or(50));
311        parse_heap.push(start);
312        ParseHeap {
313            parse_heap,
314            phantom: PhantomData,
315            config: *config,
316            rule_arena: PartialRulePool::default_pool(cat),
317        }
318    }
319
320    fn rules_mut(&mut self) -> &mut Vec<RuleHolder> {
321        &mut self.rule_arena
322    }
323}
324
325type ParserOutput<'a, T> = (LogProb<f64>, &'a [PhonContent<T>], RulePool);
326type GeneratorOutput<T> = (LogProb<f64>, Vec<PhonContent<T>>, RulePool);
327
328///An iterator constructed by [`Lexicon::fuzzy_parse`]
329pub struct FuzzyParser<
330    'a,
331    'b,
332    T: Eq + std::fmt::Debug + Clone,
333    Category: Eq + Clone + std::fmt::Debug,
334> {
335    lexicon: &'a Lexicon<T, Category>,
336    parse_heap: ParseHeap<T, FuzzyScan<'b, T>>,
337    config: &'a ParsingConfig,
338}
339
340impl<T, Category> Iterator for FuzzyParser<'_, '_, T, Category>
341where
342    T: Eq + std::fmt::Debug + Clone,
343    Category: Eq + Clone + std::fmt::Debug + Hash,
344{
345    type Item = GeneratorOutput<T>;
346
347    fn next(&mut self) -> Option<Self::Item> {
348        while let Some(mut beam) = self.parse_heap.pop() {
349            if let Some(moment) = beam.pop_moment() {
350                expand(
351                    &mut self.parse_heap,
352                    moment,
353                    beam,
354                    self.lexicon,
355                    self.config,
356                );
357            } else if let Some(x) = FuzzyScan::yield_good_parse(beam, &self.parse_heap.rule_arena) {
358                return Some(x);
359            }
360        }
361
362        None
363    }
364}
365
366///An iterator constructed by [`Lexicon::parse`] and [`Lexicon::parse_multiple`]
367pub struct Parser<'a, 'b, T: Eq + std::fmt::Debug + Clone, Category: Eq + Clone + std::fmt::Debug> {
368    lexicon: &'a Lexicon<T, Category>,
369    parse_heap: ParseHeap<T, ParseScan<'b, T>>,
370
371    #[cfg(not(target_arch = "wasm32"))]
372    start_time: Option<Instant>,
373    config: &'a ParsingConfig,
374    buffer: Vec<ParserOutput<'b, T>>,
375}
376
377impl<'a, 'b, T, Category> Iterator for Parser<'a, 'b, T, Category>
378where
379    T: Eq + std::fmt::Debug + Clone,
380    Category: Eq + Clone + std::fmt::Debug,
381{
382    type Item = ParserOutput<'b, T>;
383
384    fn next(&mut self) -> Option<Self::Item> {
385        #[cfg(not(target_arch = "wasm32"))]
386        if self.start_time.is_none() {
387            self.start_time = Some(Instant::now());
388        }
389
390        if self.buffer.is_empty() {
391            while let Some(mut beam) = self.parse_heap.pop() {
392                #[cfg(not(target_arch = "wasm32"))]
393                if let Some(max_time) = self.config.max_time
394                    && max_time < self.start_time.unwrap().elapsed()
395                {
396                    return None;
397                }
398
399                if let Some(moment) = beam.pop_moment() {
400                    expand(
401                        &mut self.parse_heap,
402                        moment,
403                        beam,
404                        self.lexicon,
405                        self.config,
406                    );
407                } else if let Some((mut good_parses, p, rules)) =
408                    ParseScan::yield_good_parse(beam, &self.parse_heap.rule_arena)
409                    && let Some(next_sentence) = good_parses.next()
410                {
411                    self.buffer
412                        .extend(good_parses.map(|x| (p, x, rules.clone())));
413                    let next = Some((p, next_sentence, rules));
414                    return next;
415                }
416            }
417        } else {
418            return self.buffer.pop();
419        }
420
421        None
422    }
423}
424
425impl<T, Category> Lexicon<T, Category>
426where
427    T: Eq + std::fmt::Debug + Clone,
428    Category: Eq + Clone + std::fmt::Debug,
429{
430    ///Generates the strings in a grammar that are findable according to the parsing config.
431    ///
432    ///
433    ///Returns an iterator, [`Parser`] which has items of type `(`[`LogProb<f64>`]`, Vec<T>,
434    ///`[`RulePool`]`)` where the middle items are the generated strings and the [`RulePool`] has
435    ///the structure of each genrerated string.
436    pub fn generate(
437        &self,
438        category: Category,
439        config: &ParsingConfig,
440    ) -> Result<Generator<&Self, T, Category>, ParsingError<Category>> {
441        let cat = self.find_category(&category)?;
442        let beam = BeamWrapper::new(GeneratorScan { sentence: vec![] }, cat);
443        let parse_heap = ParseHeap::new(beam, config, cat);
444        Ok(Generator {
445            lexicon: self,
446            config: *config,
447            parse_heap,
448            phantom: PhantomData,
449        })
450    }
451
452    ///Like [`Lexicon::generate`] but consumes the lexicon.
453    pub fn into_generate(
454        self,
455        category: Category,
456        config: &ParsingConfig,
457    ) -> Result<Generator<Self, T, Category>, ParsingError<Category>> {
458        let cat = self.find_category(&category)?;
459        let beam = BeamWrapper::new(GeneratorScan { sentence: vec![] }, cat);
460        let parse_heap = ParseHeap::new(beam, config, cat);
461        Ok(Generator {
462            lexicon: self,
463            config: *config,
464            parse_heap,
465            phantom: PhantomData,
466        })
467    }
468
469    ///Parses a sentence under the assumption it is has the category, `category`.
470    ///
471    ///Returns an iterator, [`Parser`] which has items of type `(`[`LogProb<f64>`]`, &'a [T],
472    ///`[`RulePool`]`)`
473    pub fn parse<'a, 'b>(
474        &'a self,
475        s: &'b [PhonContent<T>],
476        category: Category,
477        config: &'a ParsingConfig,
478    ) -> Result<Parser<'a, 'b, T, Category>, ParsingError<Category>> {
479        let cat = self.find_category(&category)?;
480
481        let beam = BeamWrapper::new(
482            ParseScan {
483                sentence: vec![(s, 0)],
484            },
485            cat,
486        );
487        let parse_heap = ParseHeap::new(beam, config, cat);
488        Ok(Parser {
489            lexicon: self,
490            config,
491            #[cfg(not(target_arch = "wasm32"))]
492            start_time: None,
493            buffer: vec![],
494            parse_heap,
495        })
496    }
497
498    ///Functions like [`Lexicon::parse`] but parsing multiple grammars at once.
499    pub fn parse_multiple<'a, 'b, U>(
500        &'a self,
501        sentences: &'b [U],
502        category: Category,
503        config: &'a ParsingConfig,
504    ) -> Result<Parser<'a, 'b, T, Category>, ParsingError<Category>>
505    where
506        U: AsRef<[PhonContent<T>]>,
507    {
508        let cat = self.find_category(&category)?;
509        let beams = BeamWrapper::new(
510            ParseScan {
511                sentence: sentences.iter().map(|x| (x.as_ref(), 0)).collect(),
512            },
513            cat,
514        );
515        let parse_heap = ParseHeap::new(beams, config, cat);
516        Ok(Parser {
517            lexicon: self,
518            buffer: vec![],
519            #[cfg(not(target_arch = "wasm32"))]
520            start_time: None,
521            config,
522            parse_heap,
523        })
524    }
525
526    ///Functions like [`Lexicon::parse`] or [`Lexicon::generate`] but will return parses that are
527    ///close to the provided sentences, but are not necessarily the same.
528    pub fn fuzzy_parse<'a, 'b, U>(
529        &'a self,
530        sentences: &'b [U],
531        category: Category,
532        config: &'a ParsingConfig,
533    ) -> Result<FuzzyParser<'a, 'b, T, Category>, ParsingError<Category>>
534    where
535        U: AsRef<[PhonContent<T>]>,
536    {
537        let cat = self.find_category(&category)?;
538
539        let beams = BeamWrapper::new(
540            FuzzyScan {
541                sentence_guides: sentences.iter().map(|x| (x.as_ref(), 0)).collect(),
542                generated_sentences: vec![],
543            },
544            cat,
545        );
546
547        let parse_heap = ParseHeap::new(beams, config, cat);
548
549        Ok(FuzzyParser {
550            lexicon: self,
551            config,
552            parse_heap,
553        })
554    }
555}
556
557#[derive(Debug, Clone)]
558///An iterator constructed by [`Lexicon::parse`] and [`Lexicon::parse_multiple`]
559pub struct Generator<L, T: Eq + std::fmt::Debug + Clone, Category: Eq + Clone + std::fmt::Debug> {
560    lexicon: L,
561    phantom: PhantomData<Category>,
562    parse_heap: ParseHeap<T, GeneratorScan<T>>,
563    config: ParsingConfig,
564}
565
566impl<L, T, Category> Iterator for Generator<L, T, Category>
567where
568    L: Borrow<Lexicon<T, Category>>,
569    T: Eq + std::fmt::Debug + Clone,
570    Category: Eq + Clone + std::fmt::Debug + Hash,
571{
572    type Item = GeneratorOutput<T>;
573
574    fn next(&mut self) -> Option<Self::Item> {
575        while let Some(mut beam) = self.parse_heap.pop() {
576            if let Some(moment) = beam.pop_moment() {
577                expand(
578                    &mut self.parse_heap,
579                    moment,
580                    beam,
581                    self.lexicon.borrow(),
582                    &self.config,
583                );
584            } else if let Some(x) =
585                GeneratorScan::yield_good_parse(beam, &self.parse_heap.rule_arena)
586            {
587                return Some(x);
588            }
589        }
590        None
591    }
592}
593
pub mod grammars;
pub mod lexicon;
pub mod parsing;

#[cfg(test)]
mod tests;