dbTalk Databases Forums  

Extremely slow when paging occurs

comp.databases.berkeley-db comp.databases.berkeley-db


Discuss Extremely slow when paging occurs in the comp.databases.berkeley-db forum.



Reply
 
Thread Tools Display Modes
  #1  
Old   
David
 
Posts: n/a

Default Extremely slow when paging occurs - 12-22-2005 , 10:55 AM






I am a newbie using BDB. I am testing the performance of BDB when
paging occurs. My test scenario:

Machine: 4 G RAM and 4 CPUs
OS: Windows 2000 server
Cache size: 1.5 G
Rows: 10 M
Row size: 104 bytes (one int key column and 9 char[11] columns)

It took me 4 hours to insert 10M rows into a Btree DB. Inserting the
same amount of rows into MSSQL running on the same machine only took
800 seconds.

Any options/flags I can set to make it run faster?

My program:

=============
// test.cpp : Defines the entry point for the console application.
//


#include <db_cxx.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdio.h>
#include <map>

#ifdef _WIN32
extern "C" {
extern int getopt(int, char * const *, const char *);
extern char *optarg;
}
#else
#include <unistd.h>
#endif

using namespace std;
/*
 * One fixed-size test record: an integer row number followed by nine
 * 11-byte character columns. The whole struct is written as the record
 * data by bdb_write() and read back into by bdb_read(); rowNum is also
 * used as the record key.
 *
 * NOTE(review): the accompanying post gives the row size as 104 bytes,
 * which presumably assumes a 4-byte int and 4-byte struct alignment.
 */
typedef struct _testRow
{
int rowNum; /* row index; doubles as the key value */
/* bdb_write() zeroes the row and then copies ten 'A' bytes into each
 * column, so the eleventh byte of every column stays NUL. */
char c1[11];
char c2[11];
char c3[11];
char c4[11];
char c5[11];
char c6[11];
char c7[11];
char c8[11];
char c9[11];
} testRow;
/*
 * Print the command-line synopsis to stderr.
 *
 * Always returns -1 so that callers can write `return (usage());` to
 * report a bad invocation and bail out in a single statement.
 *
 * The long option descriptions are built from adjacent string literals;
 * the original text had literals split across physical lines, which is
 * not valid C/C++.
 */
int
usage()
{
    fprintf(stderr, "example_database_read: \n");
    fprintf(stderr, " [-h <database home>, default ./ ]\n");
    fprintf(stderr,
        " [-c <Cache size in Megabytes (50M - 1800M)>, "
        "default 200] \n");
    fprintf(stderr,
        " [-p <Page size in Kilobytes (1K - 64K)>, "
        "default 4]\n");
    fprintf(stderr,
        " [-r <Number of rows to insert (10000 - 100000000)>, "
        "default 1000000]\n");
    fprintf(stderr,
        " [-t <table type: B - for Btree, H - for Hash >, "
        "default Btree]\n");

    fprintf(stderr,
        "\nNote: Any path specified to the -h parameter must end\n");
    fprintf(stderr, " with your system's path delimiter (/ or \\)\n");
    return (-1);
}
/* Directory prefix for test.db, set by -h in main(); NULL means "./". */
char *db_home_dir = NULL;

/*
 * Benchmark settings, overridable from the command line in main():
 *   cacheSize - cache size in megabytes (multiplied by 1024*1024 when used)
 *   pageSize  - page size in kilobytes (multiplied by 1024 when used)
 *   totalRows - number of rows to insert and then read back
 *   cSize     - not referenced by the visible code; main() declares its
 *               own local of the same name
 */
int cacheSize = 200, pageSize = 4, totalRows = 1000000, cSize;
/* When nonzero, bdb_write()/bdb_read() close the handle with DB_NOSYNC. */
int noSync = 1;
/* Access method passed to DB->open; -t H in main() switches it to DB_HASH. */
DBTYPE tableType = DB_BTREE;
int bdb_write();
int bdb_read();
/* NOTE(review): map_write/map_read are declared but not defined in the
 * code shown here. */
int map_write();
int map_read();
/*
 * Program entry point: parse the command-line options into the global
 * settings, then run the write benchmark followed by the read benchmark.
 *
 * Options (see usage()):
 *   -h <dir>   database home; must end with '/' or '\\'
 *   -c <MB>    cache size, 50..1800
 *   -p <KB>    page size, 1..64
 *   -r <rows>  row count, 10000..100000000
 *   -t <type>  'H' selects a Hash table; anything else keeps Btree
 *
 * A numeric value outside its range is ignored and the current setting is
 * kept; the effective value is printed either way. Failures of the two
 * benchmark phases are reported on stdout; the process still returns 0.
 */
int main(int argc, char* argv[])
{
    int ch, ret;
    int value;
    size_t homeLen;

    /*
     * "p:" must be listed here: without it getopt() never returns 'p',
     * so the page-size option was rejected as unknown.
     */
    while ((ch = getopt(argc, argv, "h:c:p:r:t:?")) != EOF)
    {
        switch (ch)
        {
        case 'h':
            /* Guard the empty string before looking at the last character. */
            homeLen = strlen(optarg);
            if (homeLen == 0 ||
                (optarg[homeLen - 1] != '/' && optarg[homeLen - 1] != '\\'))
                return (usage());
            db_home_dir = optarg;
            printf("DB home directory set to %s.\n", db_home_dir);
            break;
        case 'c':
            /* Bounds are inclusive, matching the range printed by usage(). */
            value = atoi(optarg);
            if (value >= 50 && value <= 1800)
                cacheSize = value;
            printf("Cache size set to %i Megabtyes.\n", cacheSize);
            break;
        case 'p':
            value = atoi(optarg);
            if (value >= 1 && value <= 64)
                pageSize = value;
            printf("Page size set to %i Kilobtyes.\n", pageSize);
            break;
        case 'r':
            value = atoi(optarg);
            if (value >= 10000 && value <= 100000000)
                totalRows = value;
            printf("Total rows to insert set to %i.\n", totalRows);
            break;
        case 't':
            if (optarg[0] == 'H')
                tableType = DB_HASH;
            break;

        case '?':
        default:
            return (usage());
        }
    }

    ret = bdb_write();
    if (ret != 0)
        printf("bdb_write failed, ret = %i", ret);

    /* The read phase is attempted even if the write phase reported an error. */
    ret = bdb_read();
    if (ret != 0)
        printf("bdb_read failed, ret = %i", ret);

    return 0;
}

/*
 * Write benchmark: create (or truncate) test.db under db_home_dir and
 * insert totalRows records keyed by row number, printing a timestamp at
 * each phase.
 *
 * Uses the globals cacheSize (MB), pageSize (KB), tableType, totalRows,
 * db_home_dir and noSync.
 *
 * Returns 0 on success, the Berkeley DB error code of the first failing
 * call, or -1 if the database path does not fit in the local buffer.
 * Once created, the DB handle is closed on every path, including errors.
 */
int bdb_write()
{
    DB *dbp = NULL;
    DBT key, data;
    testRow row;
    char dbFile[512];
    time_t now;
    int rowNum;
    int ret;
    int closeRet;
    int pathLen;
    size_t col;

    memset(&key, 0, sizeof(DBT));
    memset(&data, 0, sizeof(DBT));
    memset(&row, 0, sizeof(testRow));

    /* Every column holds ten 'A' bytes; the eleventh stays NUL. */
    char *const columns[] = {
        row.c1, row.c2, row.c3, row.c4, row.c5,
        row.c6, row.c7, row.c8, row.c9
    };
    for (col = 0; col < sizeof(columns) / sizeof(columns[0]); col++)
        memcpy(columns[col], "AAAAAAAAAA", 10);

    /* Bounded formatting: db_home_dir comes straight from the command line. */
    pathLen = snprintf(dbFile, sizeof(dbFile), "%stest.db",
        db_home_dir != NULL ? db_home_dir : "./");
    if (pathLen < 0 || (size_t)pathLen >= sizeof(dbFile))
    {
        printf("DB file path too long\n");
        return (-1);
    }

    ret = db_create(&dbp, NULL, 0);
    if (ret != 0)
    {
        printf("db_create failed, ret = %i", ret);
        return (ret);
    }

    ret = dbp->set_cachesize(dbp, 0, cacheSize * 1024 * 1024, 0);
    if (ret != 0)
    {
        printf("set_cachesize failed, ret = %i", ret);
        goto done;
    }
    ret = dbp->set_pagesize(dbp, pageSize * 1024);
    if (ret != 0)
    {
        printf("set_pagesize failed, ret = %i", ret);
        goto done;
    }

    ret = dbp->open(dbp, NULL, dbFile, NULL, tableType,
        DB_CREATE | DB_TRUNCATE, 0);
    if (ret != 0)
    {
        printf("DB open failed, ret = %i, Error msg = %s \n", ret,
            db_strerror(ret));
        goto done;
    }
    now = time(0L);
    printf("DB open for write at %s\n", ctime(&now));

    now = time(0L);
    printf("Insert started at %s\n", ctime(&now));
    for (rowNum = 0; rowNum < totalRows; rowNum++)
    {
        /*
         * NOTE(review): the key is the int in native byte order. As pointed
         * out in the thread, on little-endian machines such keys do not sort
         * in numeric order under the default Btree comparison, which hurts
         * insert locality. Changing the encoding would also require changing
         * bdb_read(), so it is left as is here.
         */
        row.rowNum = rowNum;
        key.data = &(row.rowNum);
        key.size = sizeof(row.rowNum);

        /* Set up the database record's data */
        data.data = &row;
        data.size = sizeof(testRow);

        ret = dbp->put(dbp, NULL, &key, &data, 0);
        if (ret != 0)
        {
            printf("DB put failed, ret = %i, Error msg = %s \n", ret,
                db_strerror(ret));
            goto done;
        }
    }
    now = time(0L);
    printf("Insert completed at %s \n", ctime(&now));

done:
    /*
     * Always release the handle. With noSync set the cache is not flushed
     * on close (as before); otherwise close performs a normal sync instead
     * of leaving the handle open.
     */
    closeRet = dbp->close(dbp, noSync ? DB_NOSYNC : 0);
    if (ret != 0)
        return (ret);
    if (closeRet != 0)
    {
        printf("DB close failed, ret = %i, Error msg = %s \n", closeRet,
            db_strerror(closeRet));
        return (closeRet);
    }
    now = time(0L);
    printf("Database closed at %s \n", ctime(&now));
    return 0;
}

/*
 * Read benchmark: open test.db read-only and fetch rows 0..totalRows-1 by
 * key, checking that each returned record carries the row number that was
 * asked for. Prints a timestamp at each phase.
 *
 * Uses the globals cacheSize (MB), pageSize (KB), tableType, totalRows,
 * db_home_dir and noSync.
 *
 * Returns 0 on success, the Berkeley DB error code of the first failing
 * call, or -1 if a record holds the wrong row number or the database path
 * does not fit in the local buffer. Once created, the DB handle is closed
 * on every path, including errors.
 */
int bdb_read()
{
    DB *dbp = NULL;
    DBT key, data;
    testRow row;
    char dbFile[512];
    time_t now;
    int rowNum;
    int ret;
    int closeRet;
    int pathLen;
    int size = sizeof(struct _testRow);

    /* Bounded formatting: db_home_dir comes straight from the command line. */
    pathLen = snprintf(dbFile, sizeof(dbFile), "%stest.db",
        db_home_dir != NULL ? db_home_dir : "./");
    if (pathLen < 0 || (size_t)pathLen >= sizeof(dbFile))
    {
        printf("DB file path too long\n");
        return (-1);
    }

    ret = db_create(&dbp, NULL, 0);
    if (ret != 0)
    {
        printf("db_create failed, ret = %i", ret);
        return (ret);
    }

    ret = dbp->set_cachesize(dbp, 0, cacheSize * 1024 * 1024, 0);
    if (ret != 0)
    {
        printf("set_cachesize failed, ret = %i", ret);
        goto done;
    }
    ret = dbp->set_pagesize(dbp, pageSize * 1024);
    if (ret != 0)
    {
        printf("set_pagesize failed, ret = %i", ret);
        goto done;
    }

    ret = dbp->open(dbp, NULL, dbFile, NULL, tableType, DB_RDONLY, 0);
    if (ret != 0)
    {
        printf("DB open failed, ret = %i", ret);
        goto done;
    }
    now = time(0L);
    printf("DB open for read at %s\n", ctime(&now));

    now = time(0L);
    printf("Read started at %s\n", ctime(&now));
    for (rowNum = 0; rowNum < totalRows; rowNum++)
    {
        /* Start each lookup from clean DBTs and a zeroed destination row. */
        memset(&key, 0, sizeof(DBT));
        memset(&data, 0, sizeof(DBT));
        memset(&row, 0, sizeof(testRow));

        key.data = &rowNum;
        key.size = sizeof(rowNum);
        key.flags = DB_DBT_USERMEM;

        /* Have the record copied directly into our own buffer. */
        data.data = &row;
        data.size = size;
        data.ulen = size;
        data.flags = DB_DBT_USERMEM;

        ret = dbp->get(dbp, NULL, &key, &data, 0);
        if (ret != 0)
        {
            printf("DB get failed, ret = %i \n Error msg = %s", ret,
                db_strerror(ret));
            goto done;
        }
        if (row.rowNum != rowNum)
        {
            printf("Got wrong result rowNum = %i and got %i", rowNum,
                row.rowNum);
            ret = -1;
            goto done;
        }
    }
    now = time(0L);
    printf("Read completed at %s \n", ctime(&now));

done:
    /* Always release the handle, not only when noSync is set. */
    closeRet = dbp->close(dbp, noSync ? DB_NOSYNC : 0);
    if (ret != 0)
        return (ret);
    if (closeRet != 0)
    {
        printf("DB close failed, ret = %i, Error msg = %s \n", closeRet,
            db_strerror(closeRet));
        return (closeRet);
    }
    now = time(0L);
    printf("Database closed at %s \n", ctime(&now));
    return 0;
}

=======================

Thanks

David


Reply With Quote
  #2  
Old   
Patrick Schaaf
 
Posts: n/a

Default Re: Extremely slow when paging occurs - 12-22-2005 , 11:06 AM






Hi David,

Quote:
It took me 4 hours to insert 10M rows into a Btree DB.
This is the typical problem when using, like you do, a little endian
binary int as the btree key.

Please read http://dev.sleepycat.com/resources/faq_show.html?id=70

best regards
Patrick


Reply With Quote
  #3  
Old   
dave
 
Posts: n/a

Default Re: Extremely slow when paging occurs - 12-28-2005 , 02:44 PM



David,

Patrick's suggestion is the correct place to start.

You may also want to test a slightly larger cache, say 2GB, just in
case you are completely filling the cache with dirty pages. If that is
the case, then each newly allocated data page is going to require a
synchronous write to disk in order to flush an existing dirty page from
the cache. Useful information regarding cache size and Btree size
estimation can be found in the following sections of the Berkeley DB
Reference Guide:

* Selecting a cache size
http://www.sleepycat.com/xmldocs/ref/toc.html

* Disk space requirements
http://www.sleepycat.com/xmldocs/ref...diskspace.html

Regards,

Dave


Reply With Quote
  #4  
Old   
David
 
Posts: n/a

Default Re: Extremely slow when paging occurs - 01-06-2006 , 01:44 PM



Dave,

Thanks for your advice. Patrick's suggestion worked for my sorted input
case. The elapsed time went down significantly.

However, for random input data, it slows down again.

Any suggestion on how to figure out the cache size to fill all dirty
pages?

If I have multiple DB writers writing to different DBs in one ENV, any
cache tuning guideline?

My test case:

I have a file with 10 Million records. Each record has 10 columns with
up to 20 bytes character. Values are very random. Assume all values are
different.

I want to write a program to find out distinct values for each column
and their count. Here is what I did:

- Open a env with cache 1.5G:

G_BDB_ENV = new DbEnv(0);
string dbHome = "G:/temp/bdb";
int ret = G_BDB_ENV->set_cachesize(1, 400*1024*1024, 1);
ret = G_BDB_ENV->open(dbHome.c_str(),
DB_INIT_CDB|DB_PRIVATE|DB_INIT_MPOOL|DB_CREATE|DB_THREAD, 0);
- Open 10 Btree DBs, one for each column:
bdb = new Db(G_BDB_ENV, 0);
string fileName = string(RepConn::generateGUID()) + ".db";
bdb->set_pagesize(32*1024);
bdb->open(NULL, fileName.c_str(), NULL, DB_BTREE, DB_CREATE |
DB_THREAD, 0);
bdb->cursor(NULL, &cursor, DB_WRITECURSOR);
- For each value, look it up in its DB, if it exists, update the record
with its count+1. otherwise, insert a record with count =1.

Dbt key, data;
I4 row_count = 0;
memset(&key, 0, sizeof(DBT));
memset(&data, 0, sizeof(DBT));


key.set_data( value+4);
key.set_size( *(I4 *)value);

data.set_data(&row_count);
data.set_size(sizeof(I4));
data.set_ulen(sizeof(I4));
data.set_flags(DB_DBT_USERMEM);
int ret = m_cursor->get(&key, &data, DB_SET );
// 1.3 if not found insert a new record
if (ret != 0)
{
row_count = 1;
m_cursor->put(&key, &data, DB_KEYLAST);
m_distinct_count ++;
}
// 1.4 if found increase the row count
else
{
row_count++;
m_cursor->put(&key, &data, DB_CURRENT);
}

The performance was acceptable until it started to page. Then it is
very slow. Easily run for 24 hours.

Again my machine has 4 cpu and 4G RAM and nothing else is running.

Did I do anything wrong here? Or what can I do better? Do you think I
should use a Hash DB instead?

Thanks

David


Reply With Quote
  #5  
Old   
David
 
Posts: n/a

Default Re: Extremely slow when paging occurs - 01-06-2006 , 02:01 PM



Hi Dave,

Thanks for your advice. I applied Patrick's suggestion and it worked
for sorted input.

Now I hit the same problem with random input. How do I figure out how
big the cache should be to hold all dirty pages? If I have multiple
writers writing to different DBs in one ENV simultaneously, any general
tuning guidelines?

Let me put it this way: I need to compute distinct values and their
count for each column in a 100 Million rows file with 10 columns.
Assuming all values are different and evenly distributed.

That is what I did:

- Set the cache size for ENV to 1.5G due to the process limit of 2G.
- Open 10 Btree DBs for 10 columns, one each
- For each row, if a value of a column does not exist in its DB, insert
it with count 1. Otherwise update the existing record and increase the
count by 1.

The performance was acceptable until it starts to page. Then it is very
slow. Easily running for 24 hours. Again My machine has 4 CPUs and 4G
Ram and nothing else is running.

Any suggestions?

Thanks

David

dave wrote:
Quote:
David,

Patrick's suggestion is the correct place to start.

You may also want to test a slightly larger cache, say 2GB, just in
case you are completely filling the cache with dirty pages. If that is
the case, then each newly allocated data page is going to require a
synchronous write to disk in order to flush an existing dirty page from
the cache. Useful information regarding cache size and Btree size
estimation can be found in the following sections of the Berkeley DB
Reference Guide:

* Selecting a cache size
http://www.sleepycat.com/xmldocs/ref/toc.html

* Disk space requirements
http://www.sleepycat.com/xmldocs/ref...diskspace.html

Regards,

Dave


Reply With Quote
  #6  
Old   
David
 
Posts: n/a

Default Re: Extremely slow when paging occurs - 01-06-2006 , 05:41 PM



Thanks for your advice. Patrick's suggestion worked for my sorted
input case. The elapsed time went down significantly to 15 minutes for
50M rows.

However, for random input key records, it slows down again.

Any suggestion on how to figure out the cache size to fill all dirty
pages?

If I have multiple DB writers writing to different DBs in one ENV
concurrently, any cache tuning suggestion?

My test case:

I have a file with 10 Million records. Each record has 10 columns with
up to 20-byte characters. Values are generated by rand(). Assume all
values are different.

I want to write a program to find out distinct values for each column
and their count. Here is what I did:

- Open a env with cache 1.5G:

G_BDB_ENV = new DbEnv(0);
string dbHome = "G:/temp/bdb";
int ret = G_BDB_ENV->set_cachesize(1, 400*1024*1024, 1);
ret = G_BDB_ENV->open(dbHome.c_str(),
DB_INIT_CDB|DB_PRIVATE|DB_INIT_MPOOL|DB_CREATE|DB_THREAD, 0);
- Open 10 Btree DBs, one for each column:
bdb = new Db(G_BDB_ENV, 0);
string fileName = string(RepConn::generateGUID()) + ".db";
bdb->set_pagesize(32*1024);
bdb->open(NULL, fileName.c_str(), NULL, DB_BTREE, DB_CREATE |
DB_THREAD, 0);
bdb->cursor(NULL, &cursor, DB_WRITECURSOR);

- For each value, look it up in its DB, if it exists, update the record
with its count+1. otherwise, insert a record with count =1.

Dbt key, data;
I4 row_count = 0;
memset(&key, 0, sizeof(DBT));
memset(&data, 0, sizeof(DBT));


key.set_data( value+4);
key.set_size( *(I4 *)value);

data.set_data(&row_count);
data.set_size(sizeof(I4));
data.set_ulen(sizeof(I4));
data.set_flags(DB_DBT_USERMEM);
int ret = m_cursor->get(&key, &data, DB_SET );
// 1.3 if not found insert a new record
if (ret != 0)
{
row_count = 1;
m_cursor->put(&key, &data, DB_KEYLAST);
m_distinct_count ++;
}
// 1.4 if found increase the row count
else
{
row_count++;
m_cursor->put(&key, &data, DB_CURRENT);
}

I have 4 threads running on my 4 CPU machine with 4G RAM.

The performance was acceptable until it started to page. Then it is
very slow. Easily run for 24 hours.

Did I do anything wrong here?

Thanks

David


Reply With Quote
Reply




Thread Tools
Display Modes

Posting Rules
You may not post new threads
You may not post replies
You may not post attachments
You may not edit your posts

vB code is On
Smilies are On
[IMG] code is On
HTML code is Off



Powered by vBulletin Version 3.5.3
Copyright ©2000 - 2012, Jelsoft Enterprises Ltd.