If given a threshold of 0.30 and a formula of #(A,B)/#A how can I compute the threshold value of each item in the data file

Below is my Association Rules Market Basket C++ Code & The Data File and Output

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#include <iostream>

#include <map>  

#include <iterator>

#include <algorithm>

#include <string>

#include <cstdlib>

#include <fstream>

#include <sstream>

using namespace std;

/**
 * Returns a lower-cased copy of `s`.
 *
 * Each byte is converted with the C library's tolower(). The byte is first
 * cast to unsigned char because calling tolower() with a negative value
 * (which a plain `char` holding a non-ASCII byte can be) is undefined
 * behaviour.
 *
 * @param s  text to convert (taken by value, so the caller's string is untouched)
 * @return   the converted text
 */
std::string toLower(std::string s) {
    // `s` is already a private copy, so it is converted in place and returned.
    for (std::string::size_type pos = 0; pos < s.size(); ++pos) {
        const unsigned char byte = static_cast<unsigned char>(s[pos]);
        s[pos] = static_cast<char>(::tolower(byte));
    }
    return s;
}

/**
 * Returns the first whitespace-delimited token of `str`.
 *
 * Leading whitespace is skipped and reading stops at the next whitespace
 * character, so surrounding blanks are removed. Note that anything after the
 * first token is dropped as well: "tooth brush" yields "tooth", and
 * "bread //comment" yields "bread".
 *
 * @param str  text to tokenize
 * @return     the first token, or an empty string when `str` is empty or
 *             contains only whitespace
 */
std::string trim(std::string str){
    std::istringstream tokenizer(str);
    std::string firstToken;
    tokenizer >> firstToken;   // leaves firstToken empty when nothing can be read
    return firstToken;
}

/**
 * Reads a market-basket transaction file and prints
 *   1. the items that occur exactly once ("unique items"), and
 *   2. the occurrence count of every item ("Items frequencies").
 *
 * Expected file layout: the first line (number of transactions) is skipped;
 * every following line is "id, item-count, item1, item2, ...".
 * Item names are lower-cased and reduced to their first word by trim()
 * before being counted.
 *
 * The file path may be given as the first command-line argument; without
 * one, the original hard-coded location is used.
 *
 * @return 0 on success, 1 when the input file cannot be opened
 */
int main(int argc, char* argv[])
{
    const string path = (argc > 1)
        ? argv[1]
        : "/Users/TheKid/Library/Mobile Documents/com~apple~TextEdit/Documents/en.lproj/marketdata.txt";

    ifstream myfile(path.c_str());
    if (!myfile.is_open()) {
        cout << "Unable to open input file." << endl;
        return 1;
    }

    map<string, int> item_count;   // item name -> number of occurrences
    string line;                   // current line of the file

    getline(myfile, line);         // skip the first line (number of transactions)

    while (getline(myfile, line)) {
        stringstream ss(line);
        string item;

        getline(ss, item, ',');    // transaction id: not needed
        getline(ss, item, ',');    // declared number of items on this line

        // atoi yields 0 for blank or malformed lines, so they contribute nothing.
        int count = atoi(item.c_str());

        // Read at most `count` items; stop early if the line runs out of fields.
        while (count-- && getline(ss, item, ',')) {
            item = trim(toLower(item));

            // A blank field (e.g. ", ,") is not an item; do not count it.
            if (item.empty()) {
                continue;
            }

            // operator[] value-initializes a missing entry to 0 before the increment.
            ++item_count[item];
        }
    }

    cout << "unique items: " << '\n';
    for (map<string, int>::const_iterator it = item_count.begin(); it != item_count.end(); ++it) {
        if (it->second == 1) {
            cout << it->first << '\n';
        }
    }

    cout << '\n' << "Items frequencies: " << '\n';
    for (map<string, int>::const_iterator it = item_count.begin(); it != item_count.end(); ++it) {
        cout << it->first << ":" << it->second << '\n';
    }

    // The ifstream closes the file when it goes out of scope.
    return 0;
}

****Data File*****
20 // Number of Transactions

1, 2, gum, bread //transaction id, # of items purchased, item1, item2,...

2, 2, gum, napkin

3, 3, fruit, bread, fork

4, 7, milk, bread, napkin, fork, juice, soup, fruit

5, 4, juice, napkin, milk, bread

6, 2, bread, spoon

7, 1, juice

8, 3, juice, napkin, milk

9, 4, juice, napkin, milk, turkey

10, 1, pork

11, 10, milk, egg, bread, napkin, spoon, chicken, water, juice, soup, fork

12, 1, egg

13, 2, bread, milk,

14, 5, gum, salt, fruit, vegetable, bread

15, 1, bread

16, 2, bread, milk

17, 3, bread, salt, egg

18, 2, milk, napkin

19, 3, turkey, juice, milk

20, 3, turkey, chicken, soup

***Output***
unique items:
pork
vegetable
water

Items frequencies:
bread:11
chicken:2
egg:3
fork:3
fruit:3
gum:3
juice:7
milk:9
napkin:7
pork:1
salt:2
soup:3
spoon:2
turkey:3
vegetable:1
water:1
Program ended with exit code: 0
Last edited on
First of all, please edit your post and add code tags, at least for the C++ source file. Highlight the contents of the file and click the "<>" format button to add the tags that are needed.

Second, you never asked a question. The title of the thread implies that you need to calculate some sort of threshold value, but you never really state what threshold you are looking to calculate. (Are you trying to calculate the ratio of unique items to total items? I really can't tell from your description.)

You need to tell us what you are specifically trying to do ("I am trying to calculate average acceleration given distance traveled at different times") and what you are currently struggling with. If you are getting erroneous results, tell us what you are expecting vs. what you are getting.

Merely showing us your code, input and output is not sufficient for us, even when our ESP is working properly.
The instructions are compute the threshold value based on a given threshold computation method and value by the user. I have the amount of times each item was bought in the output but cant figure out the Threshold computation method.

Below is the example given
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
7   // # of transactions 

1, 3, apple, gum, juice //transaction id, # of items in transaction, item1, item2,....

2, 2, banana, napkins

3, 4, napkin, apple, juice, bread

4, 1, bread

5, 3, utensil, tooth brush, tooth paste

6, 2, apple, bread

7, 5, banana, bread, gum, tooth paste, juice 

Choose the threshold computation method
1. #(A,B)/A# or 2. (#(A,B)/#A)/#transactions

Threshold: 0.30
banana----> apple, 0.30
napkin ----> apple, 0.45
bread -----> gum, 0.60
Last edited on
The instructions are compute the threshold value based on a given threshold computation method and value by the user.


Without context, this sentence is pretty much meaningless. That's what I would like you to explain to me. I can figure out how to calculate whatever you need, but I don't understand what you mean by "threshold". A threshold is a value such that if a result (a quantity, a difference, a calculated value, etc.) is greater than or equal to the threshold value, one thing happens. If not, something else (or nothing) happens. Apparently, if the calculation is greater than the user input (.30 in the example), you print out a pair of food items and the calculated value.

I have the amount of times each item was bought in the output but cant figure out the Threshold computation method.


What are we calculating the threshold of? Your statement and the calculation methods appear to be dividing quantities of things. I can make a guess of what they are, but it would only be a guess.

What do #(A,B) and A# (or #A) represent? If you can put that into words, you are half-way home. If you don't know what these expressions mean, you need to get an explanation from your instructor.
Sorry he didn't clarify the meaning of them but I did some critical thinking on my own to identify
#=Threshold 0.30
A= number of transactions with two same items
B= number of transactions with one item
The equation is = Threshold x (number of transactions having both two A,B items)/(number of transactions having A in their transaction )..

For example using my data file the equation is...
Threshold x (number of transactions having both gum and bread)/(number of transactions having gum)..

Which in numerical form is 0.30(12/3)=1.2

1, 2, gum, bread //transaction id, # of items purchased, item1, item2,...

2, 2, gum, napkin

3, 3, fruit, bread, fork

4, 7, milk, bread, napkin, fork, juice, soup, fruit

5, 4, juice, napkin, milk, bread

6, 2, bread, spoon

7, 1, juice

8, 3, juice, napkin, milk

9, 4, juice, napkin, milk, turkey

10, 1, pork

11, 10, milk, egg, bread, napkin, spoon, chicken, water, juice, soup, fork

12, 1, egg

13, 2, bread, milk,

14, 5, gum, salt, fruit, vegetable, bread

15, 1, bread

16, 2, bread, milk

17, 3, bread, salt, egg

18, 2, milk, napkin

19, 3, turkey, juice, milk

20, 3, turkey, chicken, soup
Last edited on
@Mikewill203

Have another look at what you have written here.
A= number of transactions with two same items
B= number of transactions with one item
The equation is = Threshold x (number of transactions having both two A,B items)/(number of transactions having A in their transaction )..


Then please come back with something more coherent. Perhaps then somebody would be able to help you.

Maybe you could also explain what your bizarre notation
banana----> apple, 0.30

is supposed to mean.

Last edited on
I'm going off the example exactly as it was provided to us; however, I wrote my explanation wrong:
A=How many times apple appears in Data file
B=How many times napkin appears in Data file
Threshold : 0.30
napkin ----> apple = 0.45 is how the professor typed it no clue why
0.30(3/2)=0.45


7 // # of transactions

1, 3, apple, gum, juice //transaction id, # of items in transaction, item1, item2,....

2, 2, banana, napkin

3, 4, napkin, apple, juice, bread

4, 1, bread

5, 3, utensil, tooth brush, tooth paste

6, 2, apple, bread

7, 5, banana, bread, gum, tooth paste, juice
Last edited on
Sorry, @mikewill203, but you are still making zero sense. For the dataset of 7 transactions that you have listed above I see:
1 transaction with BOTH napkin and apple in;
2 transactions with apples but no napkin;
1 transaction with napkins but no apple.

or, maybe, looked at another way,
3 transactions with apples in;
2 transactions with napkins in.

I don't see any sensible way of getting (3/2) from this (other than apples/napkins - but that's nothing like you have been stating).


If I saw some thing like
napkin ----> apple
I might just about translate it into the probability that "napkin implied apple",
or, according to probability theory,
P(apple intersection napkin) / P(napkin)
... but, sadly, that works out as 1/2.
Last edited on
I will type exactly as what is on the paper
Association Rules Program based on a given threshold computation method and value by the user.

Your program will read in the data from the text file below in this exact format:

7 // # of transactions

1, 3, apple, gum, juice //transaction id, # of items in transaction, item1, item2,....

2, 2, banana, napkin

3, 4, napkin, apple, juice, bread

4, 1, bread

5, 3, utensil, tooth brush, tooth paste

6, 2, apple, bread

7, 5, banana, bread, gum, tooth paste, juice

Program will print the following
unique items:
Item Frequencies:
Threshold Computation:

Choose the threshold computation method:
Either 1. #(A,B)/#A or 2. (#(A,B)/#A)/#transactions

Enter E to exit
You have chosen the ..... method to compute the threshold values for association
Now enter the threshold values you would like to see. Enter E to exit

Threshold: 0.30
Association rules:
banana ----> apple, 0.30
napkin ----> apple, 0.45
bread -----> gum, 0.60

When we asked him to explain the computation method to us his exact words
"it is impossible to get the given values at best they might just be examples,I will explain the computation method to you

for napkin----->apple

threshold=(number of transactions having both napkin and apple)/(number of transactions having napkin)"
@mikewill,

Go back to your teacher/lecturer/professor and ask him (a) to check his numbers and, in particular, his dataset; (b) to provide a clearer explanation.

@doug4 has already given a very clear explanation of what the word "threshold" means in English. I suspect that a threshold of 0.3 here means that you should list all pairs with an association (effectively, conditional probability) of 0.3 or greater. You don't have to multiply by this threshold. Why would you do that?

The problem is that I simply don't see how you come up with those numbers for that dataset:
banana ----> apple should be #(banana AND apple) / #(banana), which is 0. In that dataset, nobody who bought a banana also bought an apple.

napkin -----> apple should be #(napkin AND apple) / #(napkin), which is 1/2

bread -----> gum should be #(bread AND gum) / #(bread), which is 1/4


Also, if there are 7 items in the dataset I can't see how you would end up with decimal fractions like 0.30, 0.45 and 0.60. Much more likely is that it is based on a dataset with 20 elements.

I think somebody has muddled up the wrong dataset in that example.
Last edited on
Well, assuming that your definition (of association rule 1 at least) is

A -----> B, value
with
value = number of (A AND B) / number of A

and you simply write out any association for which value is greater than or equal to threshold.

I used a map<string,int> to count frequencies and a map<pair<string,string>,int> to count paired frequencies and came up with the following associations for your 20-item dataset.

Unique items:
pork
vegetable
water

Item frequencies:
bread: 11
chicken: 2
egg: 3
fork: 3
fruit: 3
gum: 3
juice: 7
milk: 9
napkin: 7
pork: 1
salt: 2
soup: 3
spoon: 2
turkey: 3
vegetable: 1
water: 1

Threshold: 0.3
fork ------> water, 0.33
turkey ------> chicken, 0.33
egg ------> fork, 0.33
fork ------> egg, 0.33
egg ------> juice, 0.33
egg ------> napkin, 0.33
egg ------> salt, 0.33
egg ------> soup, 0.33
soup ------> egg, 0.33
egg ------> spoon, 0.33
egg ------> water, 0.33
gum ------> vegetable, 0.33
fork ------> spoon, 0.33
egg ------> milk, 0.33
fruit ------> gum, 0.33
gum ------> fruit, 0.33
fruit ------> juice, 0.33
fruit ------> milk, 0.33
fruit ------> napkin, 0.33
fruit ------> salt, 0.33
fruit ------> soup, 0.33
soup ------> fruit, 0.33
fruit ------> vegetable, 0.33
gum ------> salt, 0.33
gum ------> napkin, 0.33
soup ------> water, 0.33
turkey ------> soup, 0.33
turkey ------> napkin, 0.33
soup ------> turkey, 0.33
soup ------> spoon, 0.33
fork ------> chicken, 0.33
egg ------> chicken, 0.33
juice ------> bread, 0.43
napkin ------> bread, 0.43
bread ------> milk, 0.45
spoon ------> soup, 0.50
chicken ------> turkey, 0.50
chicken ------> egg, 0.50
spoon ------> fork, 0.50
salt ------> vegetable, 0.50
salt ------> gum, 0.50
spoon ------> juice, 0.50
salt ------> fruit, 0.50
spoon ------> water, 0.50
chicken ------> bread, 0.50
chicken ------> milk, 0.50
spoon ------> chicken, 0.50
chicken ------> water, 0.50
chicken ------> spoon, 0.50
spoon ------> napkin, 0.50
chicken ------> napkin, 0.50
salt ------> egg, 0.50
chicken ------> juice, 0.50
chicken ------> fork, 0.50
spoon ------> egg, 0.50
spoon ------> milk, 0.50
milk ------> bread, 0.56
turkey ------> juice, 0.67
egg ------> bread, 0.67
soup ------> milk, 0.67
soup ------> napkin, 0.67
milk ------> napkin, 0.67
turkey ------> milk, 0.67
milk ------> juice, 0.67
soup ------> juice, 0.67
fork ------> napkin, 0.67
gum ------> bread, 0.67
soup ------> chicken, 0.67
fork ------> fruit, 0.67
fruit ------> fork, 0.67
soup ------> bread, 0.67
fork ------> juice, 0.67
fork ------> milk, 0.67
fork ------> soup, 0.67
soup ------> fork, 0.67
napkin ------> juice, 0.71
juice ------> napkin, 0.71
juice ------> milk, 0.86
napkin ------> milk, 0.86
water ------> soup, 1.00
spoon ------> bread, 1.00
vegetable ------> salt, 1.00
salt ------> bread, 1.00
fruit ------> bread, 1.00
water ------> chicken, 1.00
fork ------> bread, 1.00
water ------> spoon, 1.00
vegetable ------> bread, 1.00
water ------> napkin, 1.00
water ------> bread, 1.00
chicken ------> soup, 1.00
water ------> milk, 1.00
vegetable ------> fruit, 1.00
water ------> egg, 1.00
water ------> juice, 1.00
vegetable ------> gum, 1.00
water ------> fork, 1.00
Last edited on
I was able to see how to count the item frequencies using my code from above, but how exactly did you get the paired frequencies using map<pair<string,string>,int>?

1
2
3
4
5
6
cout << endl << "Items frequencies: " << endl;
        
        for( map<string,int>::const_iterator it = item_count.begin(); it != item_count.end(); ++it ) {
            
            cout << it->first << ":" << it->second << endl;
            

This is my attempt

1
2
3
4
5
cout << endl << "Association Rules: " << endl;
        
        for( map<pair<string,string>,int>::const_iterator it = item_count.begin(); it != item_count.end(); ++it ) {
            
            cout << it->first << ":" << it->second << endl;
My code doesn't look remotely like yours, so it won't be direct help. However, here are the relevant bits which you can probably interpret. I don't use iterators anywhere.

I have a class for a single transaction, with member data holding all the values:
1
2
3
4
5
6
7
// One transaction (one line) of the market-basket data file.
// The numeric members default to 0 so that a default-constructed Transaction
// never carries indeterminate values.
class Transaction
{
public:
   int id = 0;            // transaction id (presumably the first field of the line)
   int numItems = 0;      // item count (presumably the second field of the line)
   vector<string> data;   // names of the items in this transaction
};


Once all the transactions are read (and their internal data sorted) I run them through a processing routine to build the maps:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
void process( const vector<Transaction> &dataset, map<string,int> &frequency, map<pair<string,string>,int> &pairedFrequency )
{
   for ( Transaction trans : dataset )
   {
      // Include in single-item frequency table
      for ( string s : trans.data ) frequency[s]++;

      // Include in paired-item frequency table
      for ( int i = 0; i < trans.data.size() - 1; i++ )
      {
         for ( int j = i + 1; j < trans.data.size(); j++ )
         {
             pairedFrequency[ { trans.data[i], trans.data[j] } ]++;
         }
      }
   }
}


The frequency and pairedFrequency maps can then be used to calculate associations and, if they exceed the threshold, dumped in a separate vector of associations (another of my classes) for sorting and output.

Note that the pairs in my pairedFrequency map are in sorted order (because I sort the internal data in my transactions when I read it). When I am calculating the associations later I have to consider both orders.


Have you quizzed your teacher/lecturer/professor yet?
Last edited on
I did some major tweaks to my code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
// Interactive driver: loads the item and pair counts once, then repeatedly
// asks for a threshold computation method and a threshold value and passes
// them to showPairs() until the user enters "E" or input ends.
int main()
{
    list<ITEM_COUNT      > wordList; // lists of items and their counts
    list<ITEMS_PAIR_COUNT> pairList; // list of pairs with counts
    int nTrans = readFile(wordList,pairList);

    string str;
    int nType = 0;          // 1 or 2 once a valid method has been chosen
    float threshold = 0.;

    while(true)
    {
        cout << "THIS PROGRAM WILL COMPUTE ALL ASSOCIATION RULES in the form A->B" << endl << endl;
        cout << "  Choose the threshold computation method:" << endl;
        cout << "  Enter 1 for #(A,B)/#A or 2 for (#(A,B)/#A)/#transaction" << endl << endl;
        cout << "  Enter E to exit: " << endl;

        // Stop on end-of-input or a broken stream; otherwise the failed read
        // would repeat forever and the menu would print in an endless loop.
        if(!(cin >> str))
            break;

        if(str == "E")
            break;

        if(str == "1")
            nType = 1;
        else if(str == "2")
            nType = 2;
        else
            continue;       // unrecognised choice: show the menu again

        cout << "Enter the threshold value: ";
        if(!(cin >> threshold))
        {
            // Non-numeric input puts cin into a fail state. Reset it and drop
            // the rest of the offending line so the next prompt can read again.
            // At end-of-input the read at the top of the loop then exits.
            cin.clear();
            getline(cin, str);
            continue;
        }

        showPairs(wordList,pairList,nType,nTrans,threshold);

        // NOTE(review): "pause" and "cls" are Windows cmd commands; on other
        // systems these calls will not pause or clear the screen.
        system("pause");
        system("cls");
    }

    return 0;
}
Topic archived. No new replies allowed.