// Example02.cc - FreeLing Tutorial

#include <iostream>
#include <map>
#include "freeling.h"
using namespace std;

//---------------------------------------------
// Increase the count of key w in map m, or
// set it to 1 if not found
//
//   m : counter map, updated in place
//   w : key whose occurrence is being recorded
//---------------------------------------------

void add_count (std::map<std::wstring,int> &m, const std::wstring &w) {
   // operator[] value-initializes a missing entry to 0, so a single
   // lookup handles both the "new key" and the "already seen" cases.
   ++m[w];
}

//---------------------------------------------
// Count how often each lemma, and each pair of
// consecutive lemmas, occurs in the analyzed
// sentences, then print both tables to wcout.
// Pairs never span a sentence boundary.
//---------------------------------------------

void ProcessSentences(const list<freeling::sentence> &ls) {
   typedef map<wstring,int> counter;

   counter unigram_freq;   // lemma -> number of occurrences
   counter bigram_freq;    // "lemma1 lemma2" -> number of occurrences

   // walk over every sentence
   list<freeling::sentence>::const_iterator sent = ls.begin();
   while (sent != ls.end()) {
      // lemma of the preceding word; empty at the start of each sentence
      wstring previous;

      // walk over every word of the sentence
      for (freeling::sentence::const_iterator tok = sent->begin(); tok != sent->end(); ++tok) {
         const wstring current = tok->get_lemma();

         // one more occurrence of this lemma
         add_count(unigram_freq, current);

         // one more occurrence of the (previous, current) pair, if there is a previous lemma
         if (!previous.empty())
            add_count(bigram_freq, previous + L" " + current);

         // the current lemma becomes the left half of the next pair
         previous = current;
      }
      ++sent;
   }

   // counting is finished: dump the lemma table...
   wcout << L"========== LEMMA FREQUENCIES (lemma,freq) ====================" << endl;
   counter::const_iterator entry = unigram_freq.begin();
   while (entry != unigram_freq.end()) {
      wcout << entry->first << L" " << entry->second << endl;
      ++entry;
   }
   wcout << endl;

   // ...followed by the bigram table
   wcout << L"========== LEMMA BIGRAM FREQUENCIES (lemma1,lemma2,freq) ==============" << endl;
   entry = bigram_freq.begin();
   while (entry != bigram_freq.end()) {
      wcout << entry->first << L" " << entry->second << endl;
      ++entry;
   }
   wcout << endl;
}

//---------------------------------------------
// Build the maco_options for language `lang`,
// filling in the data files of the morphological
// submodules from the directory `lpath`
//---------------------------------------------

freeling::maco_options my_maco_options (const wstring &lang, const wstring &lpath) {
  // options holder for the requested language
  freeling::maco_options settings(lang);

  // Only the files of the submodules that will be used need to be set.
  // lpath is concatenated directly with each file name, so it must
  // already end with a path separator.

  // the user map file is left empty
  settings.UserMapFile = L"";

  // files located directly under lpath
  settings.DictionaryFile = lpath + L"dicc.src";
  settings.AffixFile = lpath + L"afixos.dat";
  settings.LocutionsFile = lpath + L"locucions.dat";
  settings.NPdataFile = lpath + L"np.dat";
  settings.ProbabilityFile = lpath + L"probabilitats.dat";

  // punctuation data is reached through the sibling "common" directory
  settings.PunctuationFile = lpath + L"../common/punct.dat";

  return settings;
}

/////////////   MAIN PROGRAM  /////////////////////
//
// Usage: program [lang [install_path]]
//   lang         : language code, used as the name of the data directory
//                  (defaults to "en")
//   install_path : installation prefix (defaults to "/usr/local")
//
// Reads all text from standard input, tokenizes it, splits it into
// sentences, runs morphological analysis and tagging, and hands the
// resulting sentences to ProcessSentences.

int main (int argc, char **argv) {

  // set locale to an UTF8 compatible locale
  freeling::util::init_locale(L"default");

  // get requested language from arg1, or English if not provided
  wstring lang = L"en";
  if (argc > 1) lang = freeling::util::string2wstring(argv[1]);
  // get installation path to use from arg2, or use /usr/local if not provided
  wstring ipath = L"/usr/local";
  if (argc > 2) ipath = freeling::util::string2wstring(argv[2]);

  // path to language data
  wstring lpath = ipath+L"/share/freeling/"+lang+L"/";

  // create analyzers
  freeling::tokenizer tk(lpath+L"tokenizer.dat"); 
  freeling::splitter sp(lpath+L"splitter.dat");

  // create the analyzer with the required set of maco_options
  freeling::maco_options opt = my_maco_options(lang,lpath);
  freeling::maco morfo(opt);

  // then, (de)activate required modules
  morfo.set_active_options (false,  // UserMap
                            true,  // NumbersDetection,
                            true,  // PunctuationDetection,
                            true,  // DatesDetection,
                            true,  // DictionarySearch,
                            true,  // AffixAnalysis,
                            false, // CompoundAnalysis,
                            true,  // RetokContractions,
                            true,  // MultiwordsDetection,
                            true,  // NERecognition,
                            false, // QuantitiesDetection,
                            true); // ProbabilityAssignment

  // create a hmm tagger for the selected language (with retokenization
  // ability, and forced to choose only one tag per word)
  freeling::hmm_tagger tagger(lpath+L"tagger.dat", true, FORCE_TAGGER); 


  // get all input text in a single string.
  // Append in place: "text = text + line + ..." would copy the whole
  // accumulated buffer for every input line (quadratic in input size).
  wstring text;
  wstring line;
  while (getline(wcin,line)) {
    text += line;
    text += L"\n";
  }


  // tokenize the whole input text into a list of words
  list<freeling::word> lw = tk.tokenize(text);
  // split the list of words into a list of sentences
  list<freeling::sentence> ls = sp.split(lw);
  // perform morphosyntactic analysis and disambiguation
  morfo.analyze(ls);
  tagger.analyze(ls);

  // do whatever is needed with processed sentences
  ProcessSentences(ls);
}
// Keys	Action
// `?`	Open this help
// `n`	Next page
// `p`	Previous page
// `s`	Search