mardi 29 décembre 2015

Implementation of Naive Bayes for text classification in C++

I am writing code to implement a Naive Bayes classifier for text classification. I have worked through a very small example (please refer to page 44), and it seems to be working.

  1. But I want to know whether the implementation is correct and whether it will work for other training and testing sets. I am not trying to implement a commercial-level Naive Bayes; this is just a small assignment to learn some C++.
  2. I want to know how good the code is: does the way I wrote it follow good C++ practice?
  3. I know there are a lot of improvements that could be made. For example, at present I am testing only one test file, so a way to test multiple files is something I am thinking of doing in the future. Also, at present I am doing only two-class classification; in the future I may do multi-class classification. But are there any other improvements, code-wise?

Here is the code, NB header file:

#pragma once

using namespace std;

// State and operations for the Naive Bayes text classifier in this listing.
// main() creates one NB per class, one for the combined corpus and one for the
// test document.
// NOTE(review): the opening/closing braces, the access specifiers and the
// terminating `};` are absent from this listing — it appears truncated; restore
// them (and the needed standard headers) before compiling.
class NB
    // Builds the combined-corpus object from two class objects.
    NB(NB& cl1, NB& cl2, string className);
    // Builds a class object; prompts on stdin for file names until "q" is entered.
    NB(string className);
    // Builds a single-document object; prompts on stdin for one file name.
    NB(string className, int classType);
    // Copies the words of every registered file into "<className>.txt".
    vector <string> combineClassText();
    // Fills _bow from the given file; isTotal selects which tally is updated
    // afterwards (_voc when true, _totalWordsinC when false).
    void bagOfWords(string classCombine, bool isTotal = false);
    // Fills _probCalc with per-word probabilities for this class, using the
    // vocabulary held by `total`.
    void calcProb(NB& total);
    // Computes, stores in _prob and returns the score of this document for
    // the class `prob`.
    float totalProb(NB& prob, NB& total);
    // Value passed to the (className, classType) constructor; main() compares it with 1.
    int classType;

    // Scratch loop bound reused by the combining constructor.
    int _len = 0;
    // Last score computed by totalProb().
    float _prob = 1.0f;
    // Number of distinct words; incremented once per _bow entry when isTotal is true.
    int _voc = 0;
    // Numerator of the class prior in totalProb(); where it is assigned is not
    // visible in this listing.
    int _nOfClass = 0;
    // Denominator of the class prior; the combining constructor sets it to
    // cl1._tnClass + cl2._tnClass.
    int _tnClass = 0;
    // Sum of all word counts in _bow (accumulated when isTotal is false).
    int _totalWordsinC = 0;
    // Counter used by totalProb() to detect a word absent from the class model.
    int _wordCounter = 0;
    // Loop flag for the file-name prompt in NB(string).
    bool _isDone = false;
    // Shared input/output streams reused by every method.
    ifstream _in;
    ofstream _out;
    //string _classCombine;
    string _className;
    // Most recently entered file name.
    string _fileName;
    // Lines written to "combineAll.txt" by the combining constructor.
    vector <string> _combined;
    // Maps each entered file name to this object's class name.
    map<string, string> _category;
    // Bag of words: word -> occurrence count.
    map<string, int> _bow;
    // Word -> probability, filled by calcProb().
    map<string, float> _probCalc;

The NB.cpp file:

#include "NB.h"

// Combining constructor: merges the text of two class objects, writes it to
// "combineAll.txt", sums their _tnClass values and builds the overall bag of
// words with isTotal = true.
// NOTE(review): this listing is visibly truncated — the opening brace, the
// stream-open call, the `if` condition, the bodies of the first two loops and
// all closing braces are missing. Restore them from the original source.
NB::NB(NB& cl1, NB& cl2, string className)
    // NOTE(review): the tail of this line looks like the remains of a call
    // that opened "combineAll.txt" for writing — TODO confirm.
    _className = className;"combineAll.txt");
    if ( {
        perror("cannot write to combineAll.txt");
    // combineClassText() is invoked only for its size here; the loop body that
    // presumably appended cl1's words to _combined is not visible.
    _len = cl1.combineClassText().size();
    for (int i = 0; i < _len; i++) {

    // Same pattern for cl2; loop body likewise missing.
    _len = cl2.combineClassText().size();
    for (int i = 0; i < _len; i++) {

    // Write every collected entry on its own line.
    _len = _combined.size();
    for (int i = 0; i < _len; i++) {
        _out << _combined[i] << endl;
        //cout << i + 1 << ". " << _combined[i] << endl;
    _tnClass = cl1._tnClass + cl2._tnClass;
    // isTotal = true: bagOfWords() updates _voc rather than _totalWordsinC.
    // NOTE(review): no close/flush of _out is visible before the file is read back.
    bagOfWords("combineAll.txt", true);

// Single-document constructor: stores classType and the class name, prompts
// once on stdin for a file name, records it in _category, then calls
// bagOfWords() on "<className>.txt".
// NOTE(review): the closing brace is missing from this listing and other
// statements may have been dropped with it — check against the original.
NB::NB(string className, int classType) {
    // The qualified name distinguishes the member from the same-named parameter.
    NB::classType = classType;
    _className = className;
    cout << "Enter a filename for " + _className << endl;
    cin >> _fileName;
    _category[_fileName] = _className;
    // NOTE(review): nothing visible in this constructor creates
    // "<className>.txt" before it is read — TODO confirm a combineClassText()
    // call was not lost here.
    bagOfWords(_className + ".txt");

// Class constructor: repeatedly prompts on stdin for file names, registering
// each one in _category under this class name, until the user enters "q";
// then calls bagOfWords() on "<className>.txt".
// NOTE(review): the opening brace and all closing braces are missing from this
// listing; statements may have been lost with them.
NB::NB(string className)
    _className = className;
    while (_isDone == false) {
        cout << "Enter a filename for " + _className << endl;
        cin >> _fileName;
        // "q" is the sentinel that ends input; it is not stored as a file name.
        if (_fileName != "q") {
            _category[_fileName] = _className;
        } else {
            _isDone = true;
    // NOTE(review): if `cin` reaches end-of-input before "q" is read,
    // _fileName stops changing and the loop above would not terminate.
    bagOfWords(_className + ".txt");

// Streams every whitespace-separated token from each file registered in
// _category to the output stream, one token per line, and returns `tmp`.
// NOTE(review): this listing is truncated — the stream-open calls, both `if`
// conditions and the closing braces are missing, and nothing visible ever
// adds to `tmp`, so as shown the returned vector is empty. Callers use its
// size, so a push_back was presumably lost — TODO confirm.
vector<string> NB::combineClassText() {

    string temp;
    // Target file name; only used by the (missing) open call as far as can be seen.
    string classCombine = _className + ".txt";
    vector <string> tmp;
    map<string, string>::iterator it;;
    if ( {
        perror("cannot write to");
    // One pass per registered input file (the map key is the file name).
    // NOTE(review): the garbled tail of the next line looks like the remains
    // of an open call on `it->first`.
    for (it = _category.begin(); it != _category.end(); it++) {>first);
        if ( {
            perror("cannot read from");
        // operator>> splits on whitespace, so line structure of the input is not preserved.
        while (_in >> temp) {
            _out << temp << endl;
    return tmp;

// Builds the word -> count map (_bow) from the tokens of a file, writes the
// map out as "word: count" lines, then updates either the vocabulary size
// (_voc, when isTotal is true) or the total word count (_totalWordsinC).
// NOTE(review): this listing is truncated — the stream-open calls, both `if`
// conditions, the body of the read loop, the body of the duplicate-count
// branch and all closing braces are missing.
void NB::bagOfWords(string classCombine, bool isTotal) {

    map<string, int>::iterator it;
    string temp;
    vector<string> tp;
    // Output file name; presumably consumed by the missing open call.
    string name = _className + "_bow.txt";
    int len;;
    if ( {
        perror("cannot read from");
    if ( {
        perror("cannot write to");

    // Loop body not visible; presumably appended each token to `tp` — TODO confirm.
    while (_in >> temp) {

    // Overwrite '.' and ',' with a space. The space stays inside the token, so
    // "end." becomes "end " and would not compare equal to "end".
    // NOTE(review): `int` indices are compared against unsigned size() here.
    for (int i = 0; i < tp.size(); i++) {
        for (int j = 0; j < tp[i].size(); j++) {
            if (tp[i][j] == '.' || tp[i][j] == ',') {
                tp[i][j] = ' ';

    len = tp.size();
    // Every token starts with a count of 1.
    vector<int> count(len, 1);

    // Compare each token with every later token (quadratic in the token count).
    // The action taken on a match is missing from this listing.
    for (int i = 0; i < len; i++) {
        for (int j = 0; j < (len - i - 1); j++) {
            if (tp[i] == tp[j + i + 1]) {

    // Walk backwards so the count at a word's lowest index is the one that
    // finally remains in the map.
    for (int i = len - 1; i >= 0; i--) {
        _bow[tp[i]] = count[i];

    for (it = _bow.begin(); it != _bow.end(); it++) {
        _out << it->first << ": " << it->second << endl;
        //cout << it->first << ": " << it->second << endl;
    //cout << endl;

    if (isTotal == true) {
        // One increment per distinct word: _voc grows by the size of _bow.
        for (it = _bow.begin(); it != _bow.end(); it++) {
            _voc += 1;
            //cout << _voc << endl;
    } else {
        // Sum of all occurrence counts.
        for (it = _bow.begin(); it != _bow.end(); it++) {
            _totalWordsinC += it->second;
        //cout << _totalWordsinC << endl;

// Computes a probability for every word in total._bow with respect to this
// class: (count + 1) / (_totalWordsinC + total._voc) when the word occurs in
// this class's _bow, otherwise 1 / (_totalWordsinC + total._voc). Results are
// stored in _probCalc and written out as "word: probability" lines.
// NOTE(review): this listing is truncated — the stream-open call, the `if`
// condition and all closing braces are missing.
void NB::calcProb(NB& total) {

    map<string, int> ::iterator it;
    map<string, int> ::iterator it2;
    // NOTE(review): the tail of the next line looks like the remains of an
    // open call on a "..._prob.txt" file name.
    map<string, float> ::iterator it3; + "_prob.txt");
        if ( {
            perror("cannot write to");
        for (it = total._bow.begin(); it != total._bow.end(); it++) {
            // NOTE(review): as shown, the else branch runs for every
            // non-matching it2, so any entry after the match overwrites the
            // matched value with the unseen-word value. Confirm whether the
            // original had a `break` after the match; a _bow.find() lookup
            // would avoid the inner scan altogether.
            for (it2 = _bow.begin(); it2 != _bow.end(); it2++) {
                if (it->first == it2->first) {
                    _probCalc[it->first] = (float)((it2->second) + 1) / (_totalWordsinC + total._voc);
                } else {
                    _probCalc[it->first] = (float)(1) / (_totalWordsinC + total._voc);

        for (it3 = _probCalc.begin(); it3 != _probCalc.end(); it3++) {
            //cout << it3->first << ": " << it3->second << endl;
            _out << it3->first << ": " << it3->second << endl;

// Scores this document against the class model `prob`: multiplies, for each
// word in this object's _bow, the class probability of that word raised to its
// count, then multiplies by the prior prob._nOfClass / total._tnClass. The
// result is stored in _prob, printed, written to the output stream and returned.
// NOTE(review): this listing is truncated — the stream-open call, the `if`
// condition and all closing braces are missing.
float NB::totalProb(NB& prob, NB& total) {

    map<string, int> ::iterator it;
    map<string, int> ::iterator it2;
    // NOTE(review): the tail of the next line looks like the remains of an
    // open call on a "<...>_<class>_prob.txt" file name.
    map<string, float> ::iterator it3; + "_" + prob._className + "_prob.txt");
    if ( {
        perror("cannot write to");
    // Reset so repeated calls on the same object do not accumulate.
    _prob = 1.0f;
    for (it = _bow.begin(); it != _bow.end(); it++) {
        for (it3 = prob._probCalc.begin(); it3 != prob._probCalc.end(); it3++) {
            if (it->first == it3->first) {
                _wordCounter = 0;
                _prob = (_prob * pow((it3->second), (it->second)));
            } else {
                // Word not found in the class model: apply the unseen-word probability.
                // NOTE(review): no increment of _wordCounter is visible, so as
                // shown this condition can only hold when _probCalc is empty —
                // an increment was presumably lost. The comparison also mixes
                // int with the unsigned size().
                if (_wordCounter == prob._probCalc.size()) {
                    _prob = _prob * ((float)1 / (prob._totalWordsinC + total._voc));
    // Class prior.
    // NOTE(review): if total._tnClass is 0 the float division yields inf/NaN.
    // A product of many small floats can also underflow to 0; summing
    // logarithms is the usual remedy.
    _prob = (_prob * ((float)(prob._nOfClass) / total._tnClass));
    cout << _prob << endl;
    _out << "The probalility of the " << _className << " beloning to " << prob._className << " is: " << _prob << endl;
    return _prob;

and finally main.cpp:


using namespace std;

// Driver: builds two class objects and the combined object interactively,
// asks for the number of test documents, builds one test object and reports
// which class scores higher for it, together with an accuracy figure.
// NOTE(review): this listing is truncated — closing braces are missing, and no
// calcProb() calls or includes are visible although totalProb() reads
// _probCalc. Restore from the original before compiling.
int main() {

    NB class1("class1");
    NB class2("class2");
    NB total(class1, class2, "all_combined");


    int nOfTestDocs = 0;
    // NOTE(review): corrClass is never incremented in the visible text, so the
    // accuracy printed below is 0 (or NaN when nOfTestDocs is 0) as shown.
    int corrClass = 0;
    float accurancy = 0.0f;
    cout << "Enter the number of test documents\n";
    cin >> nOfTestDocs;

    // Only one test document is constructed, whatever nOfTestDocs is.
    NB test("test", 1);
    // Ties go to class 1.
    if (test.totalProb(class1, total) >= test.totalProb(class2, total)) {
        cout << "The test data belongs to class 1\n";
        if (test.classType == 1) {
            accurancy = (float)corrClass / nOfTestDocs;
            cout << "The accurancy is: " << accurancy << endl;
    else {
        cout << "The test data belongs to class 2\n";
        // NOTE(review): this branch also tests classType == 1; confirm whether
        // 2 was intended for the class-2 outcome.
        if (test.classType == 1) {
            accurancy = (float)corrClass / nOfTestDocs;
            cout << "The accurancy is: " << accurancy << endl;
    return 0;

Aucun commentaire:

Enregistrer un commentaire