// minimalist_grammar_parser/parsing.rs

//! Module defining the core parsing algorithm used to generate or parse strings from MGs
use std::borrow::Borrow;
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::marker::PhantomData;

use crate::lexicon::{Feature, FeatureOrLemma, LexemeId, Lexicon, ParsingError};
use crate::parsing::rules::StolenInfo;
use crate::parsing::trees::StolenHead;
use crate::{Direction, ParseHeap, ParsingConfig};
use beam::Scanner;
use logprob::LogProb;
use petgraph::graph::NodeIndex;
use thin_vec::{ThinVec, thin_vec};
pub use trees::GornIndex;
use trees::{FutureTree, ParseMoment};

use rules::Rule;
pub use rules::RulePool;
pub(crate) use rules::{PartialRulePool, RuleHolder, RuleIndex};

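/// Bookkeeping for head movement: the ordered run of lexical heads gathered so
/// far (`heads`), the current insertion cursor (`pos`), the index of the
/// stealing head within that run (`head`), the pending [`FutureTree`] for the
/// stealer, and the Gorn index accumulated from each insertion direction.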
#[derive(Debug, Clone)]
struct AffixedHead {
    heads: Vec<LexemeId>,
    pos: usize,
    head: usize,
    tree: Option<FutureTree>,
    current_index: GornIndex,
}

impl AffixedHead {
    fn insert(&mut self, nx: LexemeId, dir: Direction) {
        match dir {
            Direction::Left => self.heads.insert(self.pos, nx),
            Direction::Right => {
                self.pos += 1;
                self.heads.insert(self.pos, nx);
            }
        }
        if self.pos <= self.head {
            self.head += 1;
        }
        self.current_index.push(dir);
    }

    fn new(nx: LexemeId, tree: FutureTree) -> Self {
        AffixedHead {
            heads: vec![nx],
            pos: 0,
            head: 0,
            tree: Some(tree),
            current_index: GornIndex::default(),
        }
    }
}

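/// A single parse hypothesis on the beam: its cumulative log probability, a
/// priority queue of pending [`ParseMoment`]s, any in-progress head-movement
/// runs, the partially built rule derivation, a counter of consecutive empty
/// scans, and the [`Scanner`] that consumes or produces surface tokens.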
#[derive(Debug, Clone)]
pub(crate) struct BeamWrapper<T, B: Scanner<T>> {
    log_prob: LogProb<f64>,
    queue: BinaryHeap<Reverse<ParseMoment>>,
    heads: ThinVec<AffixedHead>,
    rules: PartialRulePool,
    n_consec_empties: usize,
    pub beam: B,
    phantom: PhantomData<T>,
}

impl<T: Eq + std::fmt::Debug, B: Scanner<T> + Eq> PartialEq for BeamWrapper<T, B> {
    fn eq(&self, other: &Self) -> bool {
        self.beam == other.beam
            && self.log_prob == other.log_prob
            && self.queue.clone().into_sorted_vec() == other.queue.clone().into_sorted_vec()
    }
}

impl<T: Eq + std::fmt::Debug, B: Scanner<T> + Eq> Eq for BeamWrapper<T, B> {}

impl<T: Eq + std::fmt::Debug, B: Scanner<T> + Eq> PartialOrd for BeamWrapper<T, B> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<T: Eq + std::fmt::Debug, B: Scanner<T> + Eq> Ord for BeamWrapper<T, B> {
    // Beams are ordered by their log probability alone; this drives the parse heap.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.log_prob.cmp(&other.log_prob)
    }
}

impl<T: Clone + Eq + std::fmt::Debug, B: Scanner<T> + Eq + Clone> BeamWrapper<T, B> {
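    /// Scan a lemma at the current [`ParseMoment`]: depending on the
    /// head-movement state this either multiscans an assembled head run,
    /// records a stolen affix, or scans the token normally, and pushes the
    /// extended beam onto the heap when a rule applies.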
    fn scan<Category: Eq + std::fmt::Debug + Clone>(
        mut self,
        v: &mut ParseHeap<T, B>,
        moment: &ParseMoment,
        s: &Option<T>,
        child_node: LexemeId,
        child_prob: LogProb<f64>,
        lexicon: &Lexicon<T, Category>,
    ) {
        match s {
            Some(_) => self.n_consec_empties = 0,
            None => self.n_consec_empties += 1,
        }
        let rule = match moment.stolen_head {
            StolenHead::Root(head_id) => {
                let heads = &mut self.heads[head_id];
                // Overwrite this slot with the lemma we are scanning: until now we only
                // knew that an affix feature occupied it, not which lemma it would be.
                heads.heads[heads.head] = child_node;

                let heads = heads
                    .heads
                    .iter()
                    .filter_map(|x| lexicon.leaf_to_lemma(*x).unwrap().as_ref())
                    .collect();

                if self.beam.multiscan(heads) {
                    Some(Rule::Scan {
                        lexeme: child_node,
                        stolen: StolenInfo::Stealer,
                    })
                } else {
                    None
                }
            }
            StolenHead::Affix {
                rule,
                direction,
                stealer_id,
                is_done,
            } => {
                if is_done {
                    let tree = self.heads[stealer_id].tree.take().unwrap();
                    self.add_tree_to_queue(tree, thin_vec![], StolenHead::Root(stealer_id));
                }
                self.heads[stealer_id].insert(child_node, direction);
                Some(Rule::Scan {
                    lexeme: child_node,
                    stolen: StolenInfo::Stolen(rule, self.heads[stealer_id].current_index),
                })
            }
            StolenHead::None => {
                if self.beam.scan(s) {
                    Some(Rule::Scan {
                        lexeme: child_node,
                        stolen: StolenInfo::Normal,
                    })
                } else {
                    None
                }
            }
        };

        if let Some(rule) = rule {
            self.log_prob += child_prob;
            self.rules.push_rule(v.rules_mut(), rule, moment.tree.id);
            v.push(self);
        }
    }

    fn new_future_tree(&mut self, node: NodeIndex, index: GornIndex) -> (FutureTree, RuleIndex) {
        let id = self.rules.fresh();
        (FutureTree { node, index, id }, id)
    }

    fn push_moment(
        &mut self,
        node: NodeIndex,
        index: GornIndex,
        movers: ThinVec<FutureTree>,
        stolen_head_info: StolenHead,
    ) -> RuleIndex {
        let (tree, id) = self.new_future_tree(node, index);
        self.add_tree_to_queue(tree, movers, stolen_head_info);
        id
    }

    fn add_tree_to_queue(
        &mut self,
        tree: FutureTree,
        movers: ThinVec<FutureTree>,
        stolen_head: StolenHead,
    ) {
        self.queue.push(Reverse(ParseMoment {
            tree,
            stolen_head,
            movers,
        }));
    }

    pub(super) fn new(beam: B, category_index: NodeIndex) -> Self {
        let mut queue = BinaryHeap::<Reverse<ParseMoment>>::with_capacity(5);
        queue.push(Reverse(ParseMoment::new(
            FutureTree {
                node: category_index,
                index: GornIndex::default(),
                id: RuleIndex::one(),
            },
            thin_vec![],
            StolenHead::None,
        )));
        BeamWrapper {
            beam,
            n_consec_empties: 0,
            queue,
            heads: thin_vec![],
            log_prob: LogProb::prob_of_one(),
            rules: PartialRulePool::default(),
            phantom: PhantomData,
        }
    }

    pub fn log_prob(&self) -> LogProb<f64> {
        self.log_prob
    }

    pub fn n_steps(&self) -> usize {
        self.rules.n_steps()
    }

    pub fn n_consecutive_empty(&self) -> usize {
        self.n_consec_empties
    }

    pub fn pop_moment(&mut self) -> Option<ParseMoment> {
        self.queue.pop().map(|Reverse(x)| x)
    }

    pub fn is_empty(&self) -> bool {
        self.queue.is_empty()
    }
}

fn clone_push<T: Clone>(v: &[T], x: T) -> ThinVec<T> {
    let mut v: ThinVec<T> = ThinVec::from(v);
    v.push(x);
    v
}

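/// Unmerge where the selected category is supplied by one of the current
/// movers rather than by a fresh subtree: for each mover that can complete as
/// category `cat`, queue the selecting child and the mover's stored
/// continuation as separate moments. Returns `true` if at least one new beam
/// was pushed.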
#[allow(clippy::too_many_arguments)]
#[inline]
fn unmerge_from_mover<
    T: Eq + std::fmt::Debug + Clone,
    Category: Eq + std::fmt::Debug + Clone,
    B: Scanner<T> + Clone + Eq,
>(
    v: &mut ParseHeap<T, B>,
    lexicon: &Lexicon<T, Category>,
    moment: &ParseMoment,
    beam: &BeamWrapper<T, B>,
    cat: &Category,
    child_node: NodeIndex,
    dir: Direction,
    child_prob: LogProb<f64>,
) -> bool {
    let mut new_beam = false;
    for mover in &moment.movers {
        for stored_child_node in lexicon.children_of(mover.node) {
            let (stored, stored_prob) = lexicon.get(stored_child_node).unwrap();
            match stored {
                FeatureOrLemma::Feature(Feature::Category(stored)) if stored == cat => {
                    let mut beam = beam.clone();
                    let movers = moment
                        .movers
                        .iter()
                        .filter(|&v| v != mover)
                        .copied()
                        .collect();

                    let (stored_movers, child_movers) = if lexicon.is_complement(child_node) {
                        (movers, thin_vec![])
                    } else {
                        (thin_vec![], movers)
                    };

                    let child_id = beam.push_moment(
                        child_node,
                        moment.tree.index,
                        child_movers,
                        moment.stolen_head,
                    );

                    let stored_id = beam.push_moment(
                        stored_child_node,
                        mover.index,
                        stored_movers,
                        StolenHead::None,
                    );

                    beam.log_prob += stored_prob + child_prob;

                    let trace_id = beam.rules.fresh_trace();

                    beam.rules.push_rule(
                        v.rules_mut(),
                        Rule::UnmergeFromMover {
                            child: child_node,
                            child_id,
                            stored_id,
                            trace_id,
                            destination_id: mover.id,
                            dir,
                        },
                        moment.tree.id,
                    );
                    v.push(beam);
                    new_beam = true;
                }
                _ => (),
            }
        }
    }
    new_beam
}

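/// Standard unmerge: split the current moment into a selecting child and a
/// complement of category `cat`, distribute the movers according to whether
/// the child is a complement, and thread through any head-movement state given
/// by `head_info`.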
#[allow(clippy::too_many_arguments)]
#[inline]
fn unmerge<
    T: Eq + std::fmt::Debug + Clone,
    Category: Eq + std::fmt::Debug + Clone,
    B: Scanner<T> + Eq + Clone,
>(
    v: &mut ParseHeap<T, B>,
    lexicon: &Lexicon<T, Category>,
    moment: &ParseMoment,
    mut beam: BeamWrapper<T, B>,
    cat: &Category,
    dir: Direction,
    child_node: NodeIndex,
    child_prob: LogProb<f64>,
    head_info: HeadMovement,
) -> Result<(), ParsingError<Category>> {
    let complement = lexicon.find_category(cat)?;

    // This enforces the SpICmg constraint (it could be loosened by determining how to divide
    // the subsets of movers).
    let (complement_movers, child_movers) = if lexicon.is_complement(child_node) {
        (moment.movers.clone(), thin_vec![])
    } else {
        (thin_vec![], moment.movers.clone())
    };

    let (child_head, comp_head) = match head_info {
        HeadMovement::HeadMovement { stealer, stolen } => (stealer, stolen),
        HeadMovement::Inherit => (moment.stolen_head, StolenHead::None),
    };

    let (child_tree, child_id) =
        beam.new_future_tree(child_node, moment.tree.index.clone_push(dir.flip()));
    let (complement_tree, complement_id) =
        beam.new_future_tree(complement, moment.tree.index.clone_push(dir));

    match (child_head, comp_head) {
        (StolenHead::None, StolenHead::None) => {
            // No head movement involved: queue both subtrees normally.
            beam.add_tree_to_queue(child_tree, child_movers, StolenHead::None);
            beam.add_tree_to_queue(complement_tree, complement_movers, StolenHead::None);
        }
        (StolenHead::Root(_), stole @ StolenHead::Affix { .. }) => {
            // First time we are stealing a head!
            #[cfg(test)]
            assert!(child_movers.is_empty());

            // This assumes that affixes are complements, i.e. that child_node is a lexeme.
            beam.heads
                .push(AffixedHead::new(LexemeId(child_node), child_tree));
            beam.add_tree_to_queue(complement_tree, complement_movers, stole.with_done());
        }
        (StolenHead::Root(_), StolenHead::Root(_)) => {
            panic!("Should be impossible: two root heads in one unmerge")
        }
        (stole_child @ StolenHead::Affix { .. }, stolen_comp @ StolenHead::Affix { .. }) => {
            #[cfg(test)]
            assert!(child_movers.is_empty());

            // The previous head was stolen and we're stealing this one too;
            // the complement behaves normally.
            beam.add_tree_to_queue(child_tree, child_movers, stole_child.with_not_done());
            beam.add_tree_to_queue(complement_tree, complement_movers, stolen_comp.with_done());
        }
        (stole @ StolenHead::Affix { .. }, StolenHead::None) => {
            // We are finished stealing heads: the child goes up to join its stealer while
            // the complement behaves normally.
            #[cfg(test)]
            assert!(child_movers.is_empty());
            beam.add_tree_to_queue(child_tree, child_movers, stole.with_done());
            beam.add_tree_to_queue(complement_tree, complement_movers, StolenHead::None);
        }
        (_, _) => panic!("Should be impossible"),
    }

    beam.log_prob += child_prob;
    beam.rules.push_rule(
        v.rules_mut(),
        Rule::Unmerge {
            child: child_node,
            child_id,
            complement_id,
            dir,
            affix: head_info != HeadMovement::Inherit, // True only for an affix merge.
        },
        moment.tree.id,
    );
    v.push(beam);
    Ok(())
}

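/// Unmove where the licensee is taken from an existing mover: each compatible
/// mover is swapped for its stored continuation, subject to the SMC check
/// explained in the comment below. Returns `true` if at least one new beam was
/// pushed.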
#[allow(clippy::too_many_arguments)]
#[inline]
fn unmove_from_mover<
    T: Eq + std::fmt::Debug + Clone,
    Category: Eq + std::fmt::Debug + Clone,
    B: Scanner<T> + Clone + Eq,
>(
    v: &mut ParseHeap<T, B>,
    lexicon: &Lexicon<T, Category>,
    moment: &ParseMoment,
    beam: &BeamWrapper<T, B>,
    cat: &Category,
    child_node: NodeIndex,
    child_prob: LogProb<f64>,
    already_mover_of_this_cat: bool,
) -> bool {
    let mut new_beam_found = false;

    // The filter below enforces the SMC: we can't add a new moved -x if we already have a -x
    // among the movers, unless the new -x is obtained by consuming a -x that is already there.
    // E.g. if the movers are [<-w, -w>, <-x, -w>], we can turn <-w, -w> into -w since that
    // still leaves only one -w among the movers; we can't do the same with <-x, -w> since that
    // would put two -w in the movers at the same time, violating the SMC.
    for mover in moment
        .movers
        .iter()
        .filter(|x| lexicon.get_feature_category(x.node) == Some(cat) || !already_mover_of_this_cat)
    {
        for stored_child_node in lexicon.children_of(mover.node) {
            let (stored, stored_prob) = lexicon.get(stored_child_node).unwrap();
            match stored {
                FeatureOrLemma::Feature(Feature::Licensee(s)) if cat == s => {
                    let mut beam = beam.clone();
                    let (stored_tree, stored_id) =
                        beam.new_future_tree(stored_child_node, mover.index);

                    let movers = moment
                        .movers
                        .iter()
                        .filter(|&v| v != mover)
                        .copied()
                        .chain(std::iter::once(stored_tree))
                        .collect();

                    let child_id =
                        beam.push_moment(child_node, moment.tree.index, movers, moment.stolen_head);
                    beam.log_prob += stored_prob + child_prob;

                    let trace_id = beam.rules.fresh_trace();
                    beam.rules.push_rule(
                        v.rules_mut(),
                        Rule::UnmoveFromMover {
                            child_id,
                            stored_id,
                            trace_id,
                            destination_id: mover.id,
                        },
                        moment.tree.id,
                    );
                    v.push(beam);
                    new_beam_found = true;
                }
                _ => (),
            }
        }
    }
    new_beam_found
}

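/// Standard unmove: introduce a fresh licensee subtree for `cat` as a new
/// mover and requeue the child with the extended mover set.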
#[allow(clippy::too_many_arguments)]
#[inline]
fn unmove<
    T: Eq + std::fmt::Debug + Clone,
    Category: Eq + std::fmt::Debug + Clone,
    B: Scanner<T> + Eq + Clone,
>(
    v: &mut ParseHeap<T, B>,
    lexicon: &Lexicon<T, Category>,
    moment: &ParseMoment,
    mut beam: BeamWrapper<T, B>,
    cat: &Category,
    child_node: NodeIndex,
    child_prob: LogProb<f64>,
) -> Result<(), ParsingError<Category>> {
    let stored = lexicon.find_licensee(cat)?;
    let (stored, stored_id) =
        beam.new_future_tree(stored, moment.tree.index.clone_push(Direction::Left));

    let child_id = beam.push_moment(
        child_node,
        moment.tree.index.clone_push(Direction::Right),
        clone_push(&moment.movers, stored),
        moment.stolen_head,
    );

    beam.log_prob += child_prob;
    beam.rules.push_rule(
        v.rules_mut(),
        Rule::Unmove {
            child_id,
            stored_id,
        },
        moment.tree.id,
    );
    v.push(beam);
    Ok(())
}

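/// How head information propagates through an unmerge: either a head-movement
/// step with explicit stealer/stolen roles, or plain inheritance of the
/// moment's existing head state.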
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum HeadMovement {
    HeadMovement {
        stealer: StolenHead,
        stolen: StolenHead,
    },
    Inherit,
}

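/// Compute the head-movement roles for an affix merge: start a new
/// head-stealing run if the current moment has no stolen head, or extend the
/// existing affix chain otherwise.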
#[inline]
fn set_beam_head<T: Eq + std::fmt::Debug + Clone, B: Scanner<T> + Eq + Clone>(
    moment: &ParseMoment,
    dir: Direction,
    beam: &mut BeamWrapper<T, B>,
) -> HeadMovement {
    match moment.stolen_head {
        StolenHead::Root(_) => panic!(
            "Only possible if there are multiple head movements to one lemma which is not yet supported"
        ),
        StolenHead::Affix {
            stealer_id,
            rule,
            direction: old_dir,
            is_done,
        } => HeadMovement::HeadMovement {
            stealer: StolenHead::Affix {
                stealer_id,
                rule,
                direction: old_dir,
                is_done: false,
            },
            stolen: StolenHead::Affix {
                stealer_id,
                direction: dir,
                rule,
                is_done,
            },
        },
        StolenHead::None => {
            let id = beam.heads.len();
            HeadMovement::HeadMovement {
                stealer: StolenHead::Root(id),
                stolen: StolenHead::new_stolen(beam.rules.peek_fresh(), dir, id),
            }
        }
    }
}

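/// Expand a single [`ParseMoment`] against every child of its lexicon node,
/// applying scan, unmerge, and unmove rules (and their from-mover variants)
/// and pushing the resulting beams onto the parse heap.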
pub(crate) fn expand<
    T: Eq + std::fmt::Debug + Clone,
    Category: Eq + std::fmt::Debug + Clone,
    B: Scanner<T> + Eq + Clone,
    L: Borrow<Lexicon<T, Category>>,
>(
    extender: &mut ParseHeap<T, B>,
    moment: ParseMoment,
    beam: BeamWrapper<T, B>,
    lexicon: L,
    config: &ParsingConfig,
) {
    let lexicon = lexicon.borrow();
    let n_children = lexicon.n_children(moment.tree.node);
    let new_beams = itertools::repeat_n(beam, n_children);

    new_beams
        .zip(lexicon.children_of(moment.tree.node))
        .for_each(
            |(mut beam, child_node)| match lexicon.get(child_node).unwrap() {
                (FeatureOrLemma::Lemma(s), p) if moment.should_be_scanned() => {
                    beam.scan(extender, &moment, s, LexemeId(child_node), p, lexicon);
                }
                (
                    FeatureOrLemma::Complement(cat, dir)
                    | FeatureOrLemma::Feature(Feature::Selector(cat, dir)),
                    mut p,
                ) => {
                    if unmerge_from_mover(
                        extender,
                        lexicon,
                        &moment,
                        &beam,
                        cat,
                        child_node,
                        *dir,
                        if lexicon.find_category(cat).is_ok() {
                            p + config.move_prob
                        } else {
                            p
                        },
                    ) {
                        p += config.dont_move_prob;
                    }
                    let _ = unmerge(
                        extender,
                        lexicon,
                        &moment,
                        beam,
                        cat,
                        *dir,
                        child_node,
                        p,
                        HeadMovement::Inherit,
                    );
                }
                (FeatureOrLemma::Feature(Feature::Licensor(cat)), mut p) => {
                    let already_mover_of_this_cat = moment.movers.iter().any(|x| {
                        lexicon
                            .get_feature_category(x.node)
                            .is_some_and(|x| x == cat)
                    });
                    if unmove_from_mover(
                        extender,
                        lexicon,
                        &moment,
                        &beam,
                        cat,
                        child_node,
                        if lexicon.find_licensee(cat).is_ok() {
                            p + config.move_prob
                        } else {
                            p
                        },
                        already_mover_of_this_cat,
                    ) {
                        p += config.dont_move_prob;
                    }
                    if !already_mover_of_this_cat {
                        // This check corresponds to the SMC.
                        let _ = unmove(extender, lexicon, &moment, beam, cat, child_node, p);
                    }
                }
                (FeatureOrLemma::Feature(Feature::Affix(cat, dir)), p) => {
                    let head_info = set_beam_head(&moment, *dir, &mut beam);
                    // We pass Direction::Right since head movement always follows a right merge,
                    // even though the stolen head can end up to the left or right of the stealer.
                    let _ = unmerge(
                        extender,
                        lexicon,
                        &moment,
                        beam,
                        cat,
                        Direction::Right,
                        child_node,
                        p,
                        head_info,
                    );
                }
                (
                    FeatureOrLemma::Lemma(_)
                    | FeatureOrLemma::Feature(Feature::Category(_) | Feature::Licensee(_)),
                    _,
                ) => (),
                (FeatureOrLemma::Root, _) => unimplemented!("Impossible to parse the root node"),
            },
        );
}

pub mod beam;
pub mod rules;
mod trees;
pub use trees::MAX_STEPS;

#[cfg(test)]
mod tests;