mardi 29 décembre 2015

Implementation of Naive Bayes for text classification in C++

I am writing code to implement a Naive Bayes classifier for text classification. I have worked through a very small example (please refer to page 44), and it seems to be working.

  1. But I want to know whether the implementation is correct and whether it will work for other training and testing sets. I am not trying to implement a commercial-grade Naive Bayes, just a small assignment to learn some C++.
  2. I want to know how good the code is: does the way I wrote it follow good C++ practice?
  3. I know there are a lot of improvements that could be made. For example, at present I am testing only one test file, so a way to test multiple files is something I am thinking of doing in the future; also, at present I am doing only two-class classification, and in the future maybe multi-class classification. But are there any other improvements code-wise?

Here is the code, NB header file:

#pragma once

#include<iostream>
#include<fstream>
#include<string>
#include<vector>
#include<map>
using namespace std;

// Naive Bayes helper holding the word statistics of one document set:
// a training class, the combined corpus of two classes, or a test document.
//
// Every object works through text files on disk: the constructors and member
// functions read input files and write "<className>*.txt" result files.
class NB
{
public:
    // Builds the combined corpus of two classes: concatenates their words,
    // writes them to "combineAll.txt", sums their document counts and builds
    // the bag of words with isTotal = true.
    NB(NB& cl1, NB& cl2, std::string className);

    // Builds a training class: prompts on standard input for training file
    // names until "q" is entered, then builds the class's bag of words.
    // `explicit` so that a string is never silently converted into a
    // classifier (construction performs console and file I/O).
    explicit NB(std::string className);

    // Builds a labelled test document: prompts for a single file name and
    // stores `classType` as its label.
    NB(std::string className, int classType);

    // Reads every file registered with this object, writes the words one per
    // line to "<className>.txt" and returns them in reading order.
    std::vector<std::string> combineClassText();

    // Counts the words found in the file `classCombine` and writes the counts
    // to "<className>_bow.txt". With isTotal == true the number of distinct
    // words is added to the vocabulary size; otherwise the word counts are
    // added to this class's total word count.
    void bagOfWords(std::string classCombine, bool isTotal = false);

    // Computes, for every word in `total`'s bag of words, the add-one
    // smoothed probability of that word in this class and writes the table to
    // "<className>_prob.txt".
    void calcProb(NB& total);

    // Returns the product of the word probabilities of this document under
    // class `prob`, multiplied by that class's share of the training
    // documents (prob._nOfClass / total._tnClass).
    float totalProb(NB& prob, NB& total);

    // Label given to the (className, classType) constructor. Initialised to 0
    // so objects built by the other constructors never expose an
    // indeterminate value.
    int classType = 0;

private:
    int _len = 0;               // size of _combined (combined-corpus constructor)
    float _prob = 1.0f;         // last result computed by totalProb()
    int _voc = 0;               // vocabulary size accumulated by bagOfWords(.., true)
    int _nOfClass = 0;          // number of training files entered for this class
    int _tnClass = 0;           // number of training files (summed for the combined corpus)
    int _totalWordsinC = 0;     // sum of all word counts accumulated by bagOfWords(.., false)
    int _wordCounter = 0;
    bool _isDone = false;
    std::ifstream _in;
    std::ofstream _out;
    std::string _className;
    std::string _fileName;
    std::vector<std::string> _combined;          // words of both classes, in reading order
    std::map<std::string, std::string> _category; // file name -> class name
    std::map<std::string, int> _bow;              // word -> occurrence count
    std::map<std::string, float> _probCalc;       // word -> smoothed probability
};

The NB.cpp file:

#include "NB.h"
#include<cmath>

// Builds the combined corpus from two classes.
//
// The words of cl1 followed by the words of cl2 are stored in _combined and
// written one per line to "combineAll.txt"; the training-document counts of
// both classes are summed and the vocabulary is built from the written file.
//
// cl1, cl2   classes whose registered files are read via combineClassText()
// className  name used for this object's output files
NB::NB(NB& cl1, NB& cl2, string className)
{
    _className = className;
    _out.open("combineAll.txt");
    if (_out.fail()) {
        perror("cannot write to combineAll.txt");
    }

    // combineClassText() re-reads every registered file and rewrites the
    // class's "<className>.txt" on each call, so it must be called once per
    // class rather than once per copied word.
    const vector<string> firstWords = cl1.combineClassText();
    _combined.insert(_combined.end(), firstWords.begin(), firstWords.end());

    const vector<string> secondWords = cl2.combineClassText();
    _combined.insert(_combined.end(), secondWords.begin(), secondWords.end());

    _len = static_cast<int>(_combined.size());
    for (const string& word : _combined) {
        _out << word << '\n';
    }
    // close() flushes, so the file is complete before bagOfWords() reads it.
    _out.close();

    _tnClass = cl1._tnClass + cl2._tnClass;
    bagOfWords("combineAll.txt", true);
}

// Builds a labelled test document: stores the label, asks the user for one
// file name, registers it under this class name, then combines the text and
// builds the bag of words from "<className>.txt".
NB::NB(string className, int classType) {
    this->classType = classType;
    _className = className;

    cout << "Enter a filename for " << _className << endl;
    cin >> _fileName;
    _category[_fileName] = _className;

    combineClassText();

    const string combinedFile = _className + ".txt";
    bagOfWords(combinedFile);
}

// Builds a training class interactively.
//
// Prompts for training file names until the user enters "q" (or standard
// input ends), registers each file under this class name, then combines the
// text of all files and builds the bag of words from "<className>.txt".
//
// className  name of the class; also the stem of its output files
NB::NB(string className)
{
    _className = className;
    while (!_isDone) {
        cout << "Enter a filename for " + _className << endl;

        // A failed read (EOF or a broken stream) leaves _fileName unchanged,
        // so without this check the loop would never see "q" and spin forever.
        if (!(cin >> _fileName) || _fileName == "q") {
            _isDone = true;
        } else if (_category.emplace(_fileName, _className).second) {
            // Count a document only when it is newly registered: a repeated
            // name is read only once, so counting it twice would skew the
            // class prior.
            _nOfClass++;
            _tnClass++;
        }
    }
    combineClassText();
    bagOfWords(_className + ".txt");
}

// Reads every file registered in _category word by word, writes each word on
// its own line to "<className>.txt" and returns all words in reading order.
// Open failures are reported with perror() and processing continues.
vector<string> NB::combineClassText() {

    const string outputName = _className + ".txt";
    vector<string> words;

    _out.open(outputName);
    if (_out.fail()) {
        perror("cannot write to");
    }

    for (const auto& entry : _category) {
        // entry.first is the file name; the mapped class name is not needed here.
        _in.open(entry.first);
        if (_in.fail()) {
            perror("cannot read from");
        }

        string word;
        while (_in >> word) {
            _out << word << endl;
            words.push_back(word);
        }
        _in.close();
    }

    _out.close();
    return words;
}

void NB::bagOfWords(string classCombine, bool isTotal) {

    map<string, int>::iterator it;
    string temp;
    vector<string> tp;
    string name = _className + "_bow.txt";
    int len;

    _in.open(classCombine);
    if (_in.fail()) {
        perror("cannot read from");
    }

    _out.open(name);
    if (_out.fail()) {
        perror("cannot write to");
    }

    while (_in >> temp) {
        tp.push_back(temp);
    }

    for (int i = 0; i < tp.size(); i++) {
        for (int j = 0; j < tp[i].size(); j++) {
            if (tp[i][j] == '.' || tp[i][j] == ',') {
                tp[i][j] = ' ';
            }
        }
    }

    len = tp.size();
    vector<int> count(len, 1);

    for (int i = 0; i < len; i++) {
        for (int j = 0; j < (len - i - 1); j++) {
            if (tp[i] == tp[j + i + 1]) {
                count[i]++;
            }
        }
    }

    for (int i = len - 1; i >= 0; i--) {
        _bow[tp[i]] = count[i];
    }

    for (it = _bow.begin(); it != _bow.end(); it++) {
        _out << it->first << ": " << it->second << endl;
        //cout << it->first << ": " << it->second << endl;
    }
    //cout << endl;

    if (isTotal == true) {
        for (it = _bow.begin(); it != _bow.end(); it++) {
            _voc += 1;
            //cout << _voc << endl;
        }
    } else {
        for (it = _bow.begin(); it != _bow.end(); it++) {
            _totalWordsinC += it->second;
        }
        //cout << _totalWordsinC << endl;
    }
    _in.close();
    _out.close();
}

// Computes the add-one smoothed probability of every vocabulary word in this
// class:
//
//     P(word | class) = (count of word in class + 1)
//                       / (total words in class + vocabulary size)
//
// The result is stored in _probCalc and written as "word: probability" lines
// to "<className>_prob.txt".
//
// total  combined corpus; supplies the vocabulary (_bow) and its size (_voc)
void NB::calcProb(NB& total) {

    _out.open(_className + "_prob.txt");
    if (_out.fail()) {
        perror("cannot write to");
    }

    const float denominator = static_cast<float>(_totalWordsinC + total._voc);

    for (const auto& vocabEntry : total._bow) {
        // Direct lookup instead of scanning the class's whole bag of words;
        // this also yields the smoothed value when this class has no words
        // at all (the former nested loop produced no entry in that case).
        const auto found = _bow.find(vocabEntry.first);
        const int countInClass = (found != _bow.end()) ? found->second : 0;
        _probCalc[vocabEntry.first] = static_cast<float>(countInClass + 1) / denominator;
    }

    for (const auto& entry : _probCalc) {
        _out << entry.first << ": " << entry.second << '\n';
    }
    _out.close();
}

// Scores this document against one class.
//
// Multiplies, for every word of this document, the word's probability in
// `prob` raised to its number of occurrences; a word missing from the class's
// probability table contributes 1 / (total words in class + vocabulary size).
// The product is then multiplied by the class's share of the training
// documents. The result is printed, written to
// "<className>_<prob className>_prob.txt" and returned.
//
// prob   class to score against (calcProb() must have filled its table)
// total  combined corpus; supplies vocabulary size and document count
// returns the unnormalised probability; being a plain float product it can
// underflow to 0 for long documents
float NB::totalProb(NB& prob, NB& total) {

    _out.open(_className + "_" + prob._className + "_prob.txt");
    if (_out.fail()) {
        perror("cannot write to");
    }

    const float unseenWordProb = static_cast<float>(1) / (prob._totalWordsinC + total._voc);

    _prob = 1.0f;
    for (const auto& wordCount : _bow) {
        // Look the word up directly. The former miss counter lived in a
        // member that was not reset after an unseen word (nor between calls),
        // so every unseen word after the first one was silently skipped.
        const auto known = prob._probCalc.find(wordCount.first);
        if (known != prob._probCalc.end()) {
            _prob = (_prob * pow((known->second), (wordCount.second)));
        } else {
            _prob = _prob * unseenWordProb;
        }
    }

    _prob = (_prob * (static_cast<float>(prob._nOfClass) / total._tnClass));
    cout << _prob << endl;
    _out << "The probalility of the " << _className << " beloning to " << prob._className << " is: " << _prob << endl;
    _out.close();
    return _prob;
}

and finally main.cpp:

#include<iostream>
#include<vector>
#include"NB.h"

using namespace std;

// Trains a two-class Naive Bayes model from interactively entered files,
// classifies one test document (labelled as class 1) and reports the accuracy
// when the prediction matches the label.
int main() {

    NB class1("class1");
    NB class2("class2");
    NB total(class1, class2, "all_combined");

    class1.calcProb(total);
    class2.calcProb(total);

    int nOfTestDocs = 0;
    int corrClass = 0;
    float accuracy = 0.0f;
    cout << "Enter the number of test documents\n";
    cin >> nOfTestDocs;

    NB test("test", 1);

    // Score sequentially: inside a single comparison expression the order of
    // the two calls (and therefore of their console output) is unspecified.
    const float scoreClass1 = test.totalProb(class1, total);
    const float scoreClass2 = test.totalProb(class2, total);
    const int predictedClass = (scoreClass1 >= scoreClass2) ? 1 : 2;

    cout << "The test data belongs to class " << predictedClass << "\n";

    // A prediction is correct when it equals the document's own label; the
    // class-2 branch used to compare the label against 1 as well.
    if (predictedClass == test.classType) {
        corrClass++;
        // Skip the ratio when no (or a negative) document count was entered.
        if (nOfTestDocs > 0) {
            accuracy = static_cast<float>(corrClass) / nOfTestDocs;
            cout << "The accurancy is: " << accuracy << endl;
        }
    }

    // Windows-only: keeps the console window open until a key is pressed.
    system("PAUSE");
    return 0;
}

Aucun commentaire:

Enregistrer un commentaire