Readme

pymi allows you to:

  1. calculate the MI (mutual information) and AMI (average mutual information) of each bigram (pair of adjacent words) in a given corpus.
  2. plot sentences as trees based on the MI/AMI.
  3. segment sentences based on the MI/AMI.
  4. plot the distribution of the mean number of words across different MI/AMI thresholds.
In [ ]:
from pymi import PyMi
import seaborn as sns
In [ ]:
sentences = [
    ['i', 'enjoy', 'cooking', 'delicious', 'meals', 'for', 'my', 'friends', 'and', 'family'],
    ['colorless', 'green', 'sleeps', 'furiously'],
    ['the', 'sun', 'is', 'shining', 'brightly'],
    ['i', 'love', 'to', 'travel', 'and', 'explore', 'new', 'places'],
    ['music', 'makes', 'me', 'feel', 'alive'],
    ['coffee', 'is', 'my', 'go-to', 'morning', 'drink'],
    ['rainy', 'days', 'are', 'perfect', 'for', 'staying', 'in', 'and', 'reading', 'a', 'book'],
    ['hiking', 'in', 'the', 'mountains', 'is', 'a', 'great', 'way', 'to', 'unwind'],
    ['learning', 'new', 'things', 'is', 'always', 'exciting'],
    ['the', 'sound', 'of', 'waves', 'crashing', 'on', 'the', 'beach', 'is', 'so', 'soothing'],
    ['i', "can't", 'resist', 'a', 'good', 'piece', 'of', 'chocolate'],
    ['artistic', 'expression', 'is', 'a', 'beautiful', 'way', 'to', 'communicate'],
    ["i'm", 'always', 'up', 'for', 'a', 'fun', 'adventure'],
    ['the', 'smell', 'of', 'freshly', 'baked', 'bread', 'is', 'irresistible'],
    ['a', 'good', 'workout', 'can', 'boost', 'your', 'mood'],
    ['exploring', 'the', 'night', 'sky', 'with', 'a', 'telescope', 'is', 'fascinating'],
    ['spending', 'time', 'in', 'nature', 'recharges', 'my', 'energy'],
    ['singing', 'in', 'the', 'shower', 'is', 'my', 'guilty', 'pleasure'],
    ['i', 'believe', 'in', 'the', 'power', 'of', 'positive', 'thinking'],
    ['laughter', 'is', 'the', 'best', 'medicine'],
    ['helping', 'others', 'is', 'a', 'noble', 'pursuit'],
    ['dancing', 'is', 'a', 'great', 'way', 'to', 'express', 'yourself'],
    ['a', 'good', 'book', 'can', 'transport', 'you', 'to', 'another', 'world'],
    ['a', 'warm', 'cup', 'of', 'tea', 'is', 'perfect', 'for', 'a', 'cold', 'day'],
    ['i', 'enjoy', 'solving', 'challenging', 'puzzles'],
    ['animals', 'bring', 'joy', 'to', 'my', 'life'],
    ['traveling', 'allows', 'you', 'to', 'experience', 'different', 'cultures'],
    ['i', 'appreciate', 'the', 'beauty', 'of', 'a', 'starry', 'night'],
    ['meditation', 'helps', 'me', 'find', 'inner', 'peace'],
    ['spending', 'time', 'with', 'loved', 'ones', 'is', 'priceless'],
    ['learning', 'from', 'your', 'mistakes', 'is', 'important'],
    ['the', 'aroma', 'of', 'fresh', 'flowers', 'is', 'delightful'],
    ['i', 'find', 'solace', 'in', 'the', 'sound', 'of', 'a', 'babbling', 'brook'],
    ['challenges', 'make', 'us', 'stronger'],
    ['the', 'feeling', 'of', 'sand', 'between', 'your', 'toes', 'is', 'wonderful'],
    ['a', 'smile', 'can', 'brighten', "someone's", 'day'],
    ['i', 'believe', 'in', 'the', 'magic', 'of', 'the', 'universe'],
    ['sharing', 'a', 'meal', 'with', 'friends', 'is', 'a', 'special', 'moment'],
    ['learning', 'to', 'play', 'a', 'musical', 'instrument', 'is', 'fulfilling'],
    ['the', 'excitement', 'of', 'a', 'rollercoaster', 'ride', 'is', 'exhilarating'],
    ['kindness', 'is', 'a', 'virtue', 'we', 'should', 'all', 'practice'],
    ['watching', 'a', 'sunset', 'is', 'a', 'breathtaking', 'experience'],
    ['i', 'enjoy', 'the', 'thrill', 'of', 'a', 'good', 'mystery'],
    ['wandering', 'through', 'a', 'forest', 'is', 'a', 'tranquil', 'experience'],
    ['a', 'well-cooked', 'meal', 'is', 'a', 'work', 'of', 'art'],
    ['hugs', 'can', 'convey', 'more', 'than', 'words'],
    ['the', 'gentle', 'touch', 'of', 'a', 'loved', 'one', 'is', 'comforting'],
    ['a', 'good', 'movie', 'can', 'captivate', 'your', 'imagination'],
    ['candles', 'create', 'a', 'cozy', 'atmosphere']
]
In [ ]:
mi_model = PyMi(sentences, use_pickle=False)
Getting bigrams...
100%|██████████| 49/49 [00:00<00:00, 56508.36it/s]
Getting word counts...
100%|██████████| 49/49 [00:00<00:00, 147538.33it/s]
In [ ]:
mi_model.get_mi(['believe', 'in'])
Out[ ]:
0.03662289713349294
In [ ]:
mi_model.sentence_to_tree(mi_model.documents[0], type_='ami').print()
                                                  _____.0097____                             
                                                 |              |                            
              _________________________________.0207_         .0134_________                 
             |                                       |       |              |                
    _______.0223_______________________             for     my         ___.0160_____         
   |                                   |                              |             |        
 .0518__                  ___________.0309__                       friends       _.0223___   
|       |                |                  |                                   |         |  
i     enjoy         ___.0309____          meals                                and     family
                   |            |                                                            
                cooking     delicious                                                        
In [ ]:
mi_model.segment_sentence(mi_model.documents[0], type_='ami', threshold=.03, seg=' ')
Out[ ]:
['i enjoy', 'cooking delicious meals', 'for', 'my', 'friends', 'and', 'family']
In [ ]:
mi_model.sentence_to_tree(mi_model.documents[0], type_='mi').print()
     _______________________________________________________21.079_____                               
    |                                                                  |                              
 46.844_______________________________________________              42.159__________                  
|                                                     |            |                |                 
i               ___________________________________105.39_        my          ___70.266______         
               |                                          |                  |               |        
          __140.53_________________________              for              friends        _140.53___   
         |                                 |                                            |          |  
       enjoy                 ___________421.59__                                       and      family
                            |                   |                                                     
                      ___421.59____           meals                                                   
                     |             |                                                                  
                  cooking      delicious                                                              
In [ ]:
mi_model.segment_sentence(mi_model.documents[0], type_='mi', threshold=140, seg=' ')
Out[ ]:
['i', 'enjoy cooking delicious meals', 'for', 'my', 'friends', 'and family']
In [ ]:
mi_model = PyMi('demo_docs_eng.pickle', use_pickle=True)
Getting bigrams...
100%|██████████| 49/49 [00:00<00:00, 84333.56it/s]
Getting word counts...
100%|██████████| 49/49 [00:00<00:00, 347751.09it/s]
100%|██████████| 49/49 [00:00<00:00, 347751.09it/s]
In [ ]:
mi_model.save_mi_to_file(file_name='demo_docs_eng_mi_dic.pickle', type_='mi')
Found existing mi file with 281 ngrams.
0it [00:00, ?it/s]
0it [00:00, ?it/s]
In [ ]:
mi_model.mi_dic
Out[ ]:
{('good', 'book'): 42.15981012658228,
 ('staying', 'in'): 60.22830018083182,
 ('my', 'go-to'): 84.31962025316456,
 ('helping', 'others'): 421.5981012658228,
 ('us', 'stronger'): 421.5981012658228,
 ('the', 'shower'): 23.42211673699016,
 ('captivate', 'your'): 105.3995253164557,
 ('a', 'babbling'): 14.053270042194093,
 ('piece', 'of'): 30.11415009041591,
 ('are', 'perfect'): 210.7990506329114,
 ('is', 'important'): 16.863924050632914,
 ('the', 'aroma'): 23.42211673699016,
 ('of', 'sand'): 30.11415009041591,
 ('of', 'positive'): 30.11415009041591,
 ('convey', 'more'): 421.5981012658228,
 ('expression', 'is'): 16.863924050632914,
 ('cozy', 'atmosphere'): 421.5981012658228,
 ('the', 'thrill'): 23.42211673699016,
 ('in', 'and'): 20.076100060277277,
 ('animals', 'bring'): 421.5981012658228,
 ('with', 'loved'): 70.26635021097047,
 ('joy', 'to'): 52.69976265822785,
 ('we', 'should'): 421.5981012658228,
 ('telescope', 'is'): 16.863924050632914,
 ('a', 'warm'): 14.053270042194093,
 ('and', 'family'): 140.53270042194094,
 ('laughter', 'is'): 16.863924050632914,
 ('your', 'mistakes'): 105.3995253164557,
 ('smile', 'can'): 84.31962025316456,
 ('of', 'chocolate'): 30.11415009041591,
 ('more', 'than'): 421.5981012658228,
 ('good', 'movie'): 84.31962025316456,
 ('friends', 'and'): 70.26635021097047,
 ('can', 'brighten'): 84.31962025316456,
 ('reading', 'a'): 14.053270042194093,
 ('find', 'inner'): 210.7990506329114,
 ('experience', 'different'): 140.53270042194094,
 ('a', 'beautiful'): 14.053270042194093,
 ('allows', 'you'): 210.7990506329114,
 ('make', 'us'): 421.5981012658228,
 ('learning', 'from'): 140.53270042194094,
 ('sky', 'with'): 140.53270042194094,
 ('one', 'is'): 16.863924050632914,
 ('the', 'night'): 11.71105836849508,
 ('hiking', 'in'): 60.22830018083182,
 ('the', 'gentle'): 23.42211673699016,
 ('singing', 'in'): 60.22830018083182,
 ('is', 'a'): 5.059177215189874,
 ('magic', 'of'): 30.11415009041591,
 ('a', 'rollercoaster'): 14.053270042194093,
 ('to', 'my'): 10.53995253164557,
 ('my', 'friends'): 42.15981012658228,
 ('bread', 'is'): 16.863924050632914,
 ('can', 'transport'): 84.31962025316456,
 ('to', 'express'): 52.69976265822785,
 ('of', 'a'): 5.019025015069319,
 ('rainy', 'days'): 421.5981012658228,
 ('music', 'makes'): 421.5981012658228,
 ('sound', 'of'): 30.11415009041591,
 ('with', 'friends'): 70.26635021097047,
 ('excitement', 'of'): 30.11415009041591,
 ('the', 'beach'): 23.42211673699016,
 ('up', 'for'): 105.3995253164557,
 ('a', 'cozy'): 14.053270042194093,
 ('learning', 'new'): 70.26635021097047,
 ('positive', 'thinking'): 421.5981012658228,
 ('warm', 'cup'): 421.5981012658228,
 ('enjoy', 'the'): 7.8073722456633865,
 ('candles', 'create'): 421.5981012658228,
 ('recharges', 'my'): 84.31962025316456,
 ('of', 'waves'): 30.11415009041591,
 ('i', 'enjoy'): 46.84423347398032,
 ('another', 'world'): 421.5981012658228,
 ('time', 'with'): 70.26635021097047,
 ('a', 'fun'): 14.053270042194093,
 ('great', 'way'): 140.53270042194094,
 ('feeling', 'of'): 30.11415009041591,
 ('fun', 'adventure'): 421.5981012658228,
 ('can', 'boost'): 84.31962025316456,
 ('sharing', 'a'): 14.053270042194093,
 ('is', 'always'): 8.431962025316457,
 ('shower', 'is'): 16.863924050632914,
 ('believe', 'in'): 60.22830018083182,
 ('so', 'soothing'): 421.5981012658228,
 ('is', 'fulfilling'): 16.863924050632914,
 ('morning', 'drink'): 421.5981012658228,
 ('i', 'find'): 23.42211673699016,
 ('watching', 'a'): 14.053270042194093,
 ('i', 'appreciate'): 46.84423347398032,
 ('breathtaking', 'experience'): 140.53270042194094,
 ('way', 'to'): 52.699762658227854,
 ('bring', 'joy'): 421.5981012658228,
 ('loved', 'ones'): 210.7990506329114,
 ('a', 'sunset'): 14.053270042194093,
 ('things', 'is'): 16.863924050632914,
 ('sunset', 'is'): 16.863924050632914,
 ('a', 'great'): 14.053270042194093,
 ('to', 'unwind'): 52.69976265822785,
 ('than', 'words'): 421.5981012658228,
 ('to', 'experience'): 17.566587552742618,
 ('challenging', 'puzzles'): 421.5981012658228,
 ('best', 'medicine'): 421.5981012658228,
 ('in', 'nature'): 60.22830018083182,
 ('is', 'shining'): 16.863924050632914,
 ('traveling', 'allows'): 421.5981012658228,
 ('good', 'mystery'): 84.31962025316456,
 ('meditation', 'helps'): 421.5981012658228,
 ('smell', 'of'): 30.11415009041591,
 ('cup', 'of'): 30.11415009041591,
 ('a', 'tranquil'): 14.053270042194093,
 ('special', 'moment'): 421.5981012658228,
 ('to', 'play'): 52.69976265822785,
 ('a', 'breathtaking'): 14.053270042194093,
 ('coffee', 'is'): 16.863924050632914,
 ('the', 'feeling'): 23.42211673699016,
 ('exploring', 'the'): 23.42211673699016,
 ('my', 'life'): 84.31962025316456,
 ('appreciate', 'the'): 23.42211673699016,
 ('colorless', 'green'): 421.5981012658228,
 ('flowers', 'is'): 16.863924050632914,
 ('meals', 'for'): 105.3995253164557,
 ('new', 'things'): 210.7990506329114,
 ('for', 'a'): 7.0266350210970465,
 ("i'm", 'always'): 210.7990506329114,
 ('a', 'book'): 7.0266350210970465,
 ('your', 'mood'): 105.3995253164557,
 ('my', 'energy'): 84.31962025316456,
 ('mountains', 'is'): 16.863924050632914,
 ('tea', 'is'): 16.863924050632914,
 ("can't", 'resist'): 421.5981012658228,
 ('is', 'perfect'): 8.431962025316457,
 ('hugs', 'can'): 84.31962025316456,
 ('noble', 'pursuit'): 421.5981012658228,
 ('is', 'priceless'): 16.863924050632914,
 ("someone's", 'day'): 210.7990506329114,
 ('well-cooked', 'meal'): 210.7990506329114,
 ('inner', 'peace'): 421.5981012658228,
 ('for', 'staying'): 105.3995253164557,
 ('is', 'my'): 6.745569620253166,
 ('makes', 'me'): 210.7990506329114,
 ('good', 'piece'): 84.31962025316456,
 ('brighten', "someone's"): 421.5981012658228,
 ('new', 'places'): 210.7990506329114,
 ('you', 'to'): 52.69976265822785,
 ('crashing', 'on'): 421.5981012658228,
 ('a', 'musical'): 14.053270042194093,
 ('a', 'well-cooked'): 14.053270042194093,
 ('freshly', 'baked'): 421.5981012658228,
 ('dancing', 'is'): 16.863924050632914,
 ('i', 'love'): 46.84423347398032,
 ('different', 'cultures'): 421.5981012658228,
 ('a', 'work'): 14.053270042194093,
 ('the', 'excitement'): 23.42211673699016,
 ('is', 'irresistible'): 16.863924050632914,
 ('perfect', 'for'): 105.3995253164557,
 ('express', 'yourself'): 421.5981012658228,
 ('days', 'are'): 421.5981012658228,
 ('beach', 'is'): 16.863924050632914,
 ('beautiful', 'way'): 140.53270042194094,
 ('power', 'of'): 30.11415009041591,
 ('is', 'the'): 0.9368846694796062,
 ('should', 'all'): 421.5981012658228,
 ('to', 'travel'): 52.69976265822785,
 ('baked', 'bread'): 421.5981012658228,
 ('forest', 'is'): 16.863924050632914,
 ('time', 'in'): 30.11415009041591,
 ('of', 'freshly'): 30.11415009041591,
 ('nature', 'recharges'): 421.5981012658228,
 ('of', 'the'): 1.6730083383564396,
 ('feel', 'alive'): 421.5981012658228,
 ('the', 'magic'): 23.42211673699016,
 ('green', 'sleeps'): 421.5981012658228,
 ('on', 'the'): 23.42211673699016,
 ('solving', 'challenging'): 421.5981012658228,
 ('sand', 'between'): 421.5981012658228,
 ('helps', 'me'): 210.7990506329114,
 ('is', 'wonderful'): 16.863924050632914,
 ('instrument', 'is'): 16.863924050632914,
 ('shining', 'brightly'): 421.5981012658228,
 ('sun', 'is'): 16.863924050632914,
 ('wandering', 'through'): 421.5981012658228,
 ('good', 'workout'): 84.31962025316456,
 ('is', 'fascinating'): 16.863924050632914,
 ('of', 'art'): 30.11415009041591,
 ('a', 'good'): 14.053270042194097,
 ('i', "can't"): 46.84423347398032,
 ('a', 'virtue'): 14.053270042194093,
 ('guilty', 'pleasure'): 421.5981012658228,
 ('a', 'starry'): 14.053270042194093,
 ('starry', 'night'): 210.7990506329114,
 ('meal', 'with'): 70.26635021097047,
 ('is', 'delightful'): 16.863924050632914,
 ('the', 'power'): 23.42211673699016,
 ('the', 'smell'): 23.42211673699016,
 ('virtue', 'we'): 421.5981012658228,
 ('and', 'reading'): 140.53270042194094,
 ('a', 'smile'): 14.053270042194093,
 ('from', 'your'): 105.3995253164557,
 ('resist', 'a'): 14.053270042194093,
 ('solace', 'in'): 60.22830018083182,
 ('mistakes', 'is'): 16.863924050632914,
 ('a', 'forest'): 14.053270042194093,
 ('through', 'a'): 14.053270042194093,
 ('the', 'best'): 23.42211673699016,
 ('me', 'find'): 105.3995253164557,
 ('the', 'sound'): 23.42211673699016,
 ('your', 'toes'): 105.3995253164557,
 ('a', 'loved'): 7.0266350210970465,
 ('of', 'fresh'): 30.11415009041591,
 ('touch', 'of'): 30.11415009041591,
 ('movie', 'can'): 84.31962025316456,
 ('always', 'up'): 210.7990506329114,
 ('ones', 'is'): 16.863924050632914,
 ('cold', 'day'): 210.7990506329114,
 ('others', 'is'): 16.863924050632914,
 ('love', 'to'): 52.69976265822785,
 ('me', 'feel'): 210.7990506329114,
 ('and', 'explore'): 140.53270042194094,
 ('of', 'tea'): 30.11415009041591,
 ('rollercoaster', 'ride'): 421.5981012658228,
 ('for', 'my'): 21.07990506329114,
 ('i', 'believe'): 46.84423347398032,
 ('workout', 'can'): 84.31962025316456,
 ('spending', 'time'): 210.7990506329114,
 ('find', 'solace'): 210.7990506329114,
 ('babbling', 'brook'): 421.5981012658228,
 ('musical', 'instrument'): 421.5981012658228,
 ('is', 'exhilarating'): 16.863924050632914,
 ('is', 'comforting'): 16.863924050632914,
 ('your', 'imagination'): 105.3995253164557,
 ('explore', 'new'): 210.7990506329114,
 ('travel', 'and'): 140.53270042194094,
 ('challenges', 'make'): 421.5981012658228,
 ('beauty', 'of'): 30.11415009041591,
 ('create', 'a'): 14.053270042194093,
 ('to', 'another'): 52.69976265822785,
 ('delicious', 'meals'): 421.5981012658228,
 ('sleeps', 'furiously'): 421.5981012658228,
 ('the', 'sun'): 23.42211673699016,
 ('work', 'of'): 30.11415009041591,
 ('enjoy', 'cooking'): 140.53270042194094,
 ('transport', 'you'): 210.7990506329114,
 ('a', 'noble'): 14.053270042194093,
 ('the', 'universe'): 23.42211673699016,
 ('enjoy', 'solving'): 140.53270042194094,
 ('is', 'so'): 16.863924050632914,
 ('a', 'telescope'): 14.053270042194093,
 ('my', 'guilty'): 84.31962025316456,
 ('fresh', 'flowers'): 421.5981012658228,
 ('kindness', 'is'): 16.863924050632914,
 ('tranquil', 'experience'): 140.53270042194094,
 ('to', 'communicate'): 52.69976265822785,
 ('cooking', 'delicious'): 421.5981012658228,
 ('waves', 'crashing'): 421.5981012658228,
 ('boost', 'your'): 105.3995253164557,
 ('can', 'convey'): 84.31962025316456,
 ('always', 'exciting'): 210.7990506329114,
 ('go-to', 'morning'): 421.5981012658228,
 ('loved', 'one'): 210.7990506329114,
 ('night', 'sky'): 210.7990506329114,
 ('the', 'beauty'): 23.42211673699016,
 ('in', 'the'): 16.7300833835644,
 ('with', 'a'): 4.684423347398032,
 ('toes', 'is'): 16.863924050632914,
 ('a', 'special'): 14.053270042194093,
 ('play', 'a'): 14.053270042194093,
 ('gentle', 'touch'): 421.5981012658228,
 ('book', 'can'): 42.15981012658228,
 ('ride', 'is'): 16.863924050632914,
 ('learning', 'to'): 17.566587552742618,
 ('the', 'mountains'): 23.42211673699016,
 ('aroma', 'of'): 30.11415009041591,
 ('a', 'meal'): 7.0266350210970465,
 ('thrill', 'of'): 30.11415009041591,
 ('artistic', 'expression'): 421.5981012658228,
 ('between', 'your'): 105.3995253164557,
 ('meal', 'is'): 8.431962025316457,
 ('a', 'cold'): 14.053270042194093,
 ('friends', 'is'): 8.431962025316457,
 ('can', 'captivate'): 84.31962025316456,
 ('all', 'practice'): 421.5981012658228}
In [ ]:
distribution = mi_model.get_distribution(0, 1000, type_='mi')
threshold:1000.0; word percentage:last––0.14243823070353684 this––0.14243823070353684: 100%|██████████| 150/150 [00:00<00:00, 552.55it/s]           
In [ ]:
sns.lineplot(
    x=distribution['threshold'],
    y=distribution['mean_word_percentage']
)
Out[ ]:
<Axes: >
In [ ]: