2.用完成例程方式實現(xiàn)的重疊I/O模型
#include <WINSOCK2.H>
#include <stdio.h>
#define PORT 5150
#define MSGSIZE 1024
#pragma comment(lib, "ws2_32.lib")
typedef struct
{
WSAOVERLAPPED overlap;
WSABUF Buffer;
char szMessage[MSGSIZE];
DWORD NumberOfBytesRecvd;
DWORD Flags;
SOCKET sClient;
}PER_IO_OPERATION_DATA, *LPPER_IO_OPERATION_DATA;
DWORD WINAPI WorkerThread(LPVOID);
void CALLBACK CompletionROUTINE(DWORD, DWORD, LPWSAOVERLAPPED, DWORD);
SOCKET g_sNewClientConnection;
BOOL g_bNewConnectionArrived = FALSE;
int main()
{
WSADATA wsaData;
SOCKET sListen;
SOCKADDR_IN local, client;
DWORD dwThreadId;
int iaddrSize = sizeof(SOCKADDR_IN);
// Initialize Windows Socket library
WSAStartup(0x0202, &wsaData);
// Create listening socket
sListen = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
// Bind
local.sin_addr.S_un.S_addr = htonl(INADDR_ANY);
local.sin_family = AF_INET;
local.sin_port = htons(PORT);
bind(sListen, (struct sockaddr *)&local, sizeof(SOCKADDR_IN));
// Listen
listen(sListen, 3);
// Create worker thread
CreateThread(NULL, 0, WorkerThread, NULL, 0, &dwThreadId);
while (TRUE)
{
// Accept a connection
g_sNewClientConnection = accept(sListen, (struct sockaddr *)&client, &iaddrSize);
g_bNewConnectionArrived = TRUE;
printf("Accepted client:%s:%d\n", inet_ntoa(client.sin_addr), ntohs(client.sin_port));
}
}
DWORD WINAPI WorkerThread(LPVOID lpParam)
{
LPPER_IO_OPERATION_DATA lpPerIOData = NULL;
while (TRUE)
{
if (g_bNewConnectionArrived)
{
// Launch an asynchronous operation for new arrived connection
lpPerIOData = (LPPER_IO_OPERATION_DATA)HeapAlloc(
GetProcessHeap(),
HEAP_ZERO_MEMORY,
sizeof(PER_IO_OPERATION_DATA));
lpPerIOData->Buffer.len = MSGSIZE;
lpPerIOData->Buffer.buf = lpPerIOData->szMessage;
lpPerIOData->sClient = g_sNewClientConnection;
WSARecv(lpPerIOData->sClient,
&lpPerIOData->Buffer,
1,
&lpPerIOData->NumberOfBytesRecvd,
&lpPerIOData->Flags,
&lpPerIOData->overlap,
CompletionROUTINE);
g_bNewConnectionArrived = FALSE;
}
SleepEx(1000, TRUE);
}
return 0;
}
void CALLBACK CompletionROUTINE(DWORD dwError,
DWORD cbTransferred,
LPWSAOVERLAPPED lpOverlapped,
DWORD dwFlags)
{
LPPER_IO_OPERATION_DATA lpPerIOData = (LPPER_IO_OPERATION_DATA)lpOverlapped;
if (dwError != 0 || cbTransferred == 0)
{
// Connection was closed by client
closesocket(lpPerIOData->sClient);
HeapFree(GetProcessHeap(), 0, lpPerIOData);
}
else
{
lpPerIOData->szMessage[cbTransferred] = '\0';
send(lpPerIOData->sClient, lpPerIOData->szMessage, cbTransferred, 0);
// Launch another asynchronous operation
memset(&lpPerIOData->overlap, 0, sizeof(WSAOVERLAPPED));
lpPerIOData->Buffer.len = MSGSIZE;
lpPerIOData->Buffer.buf = lpPerIOData->szMessage;
WSARecv(lpPerIOData->sClient,
&lpPerIOData->Buffer,
1,
&lpPerIOData->NumberOfBytesRecvd,
&lpPerIOData->Flags,
&lpPerIOData->overlap,
CompletionROUTINE);
}
}
用完成例程來實現(xiàn)重疊I/O比用事件通知簡單得多。在這個模型中,主線程只用不停的接受連接即可;輔助線程判斷有沒有新的客戶端連接被建立,如果有,就為那個客戶端套接字激活一個異步的WSARecv操作,然后調(diào)用SleepEx使線程處于一種可警告的等待狀態(tài),以使得I/O完成后CompletionROUTINE可以被內(nèi)核調(diào)用。如果輔助線程不調(diào)用SleepEx,則內(nèi)核在完成一次I/O操作后,無法調(diào)用完成例程(因為完成例程的運行應(yīng)該和當(dāng)初激活WSARecv異步操作的代碼在同一個線程之內(nèi))。
完成例程內(nèi)的實現(xiàn)代碼比較簡單,它取出接收到的數(shù)據(jù),然后將數(shù)據(jù)原封不動的發(fā)送給客戶端,最后重新激活另一個WSARecv異步操作。注意,在這里用到了“尾隨數(shù)據(jù)”。我們在調(diào)用WSARecv的時候,參數(shù)lpOverlapped實際上指向一個比它大得多的結(jié)構(gòu)PER_IO_OPERATION_DATA,這個結(jié)構(gòu)除了WSAOVERLAPPED以外,還被我們附加了緩沖區(qū)的結(jié)構(gòu)信息,另外還包括客戶端套接字等重要的信息。這樣,在完成例程中通過參數(shù)lpOverlapped拿到的不僅僅是WSAOVERLAPPED結(jié)構(gòu),還有后邊尾隨的包含客戶端套接字和接收數(shù)據(jù)緩沖區(qū)等重要信息。這樣的C語言技巧在我后面介紹完成端口的時候還會使用到。
五.完成端口模型
“完成端口”模型是迄今為止最為復(fù)雜的一種I/O模型。然而,假若一個應(yīng)用程序同時需要管理為數(shù)眾多的套接字,那么采用這種模型,往往可以達(dá)到最佳的系統(tǒng)性能!但不幸的是,該模型只適用于Windows NT和Windows 2000操作系統(tǒng)。因其設(shè)計的復(fù)雜性,只有在你的應(yīng)用程序需要同時管理數(shù)百乃至上千個套接字的時候,而且希望隨著系統(tǒng)內(nèi)安裝的CPU數(shù)量的增多,應(yīng)用程序的性能也可以線性提升,才應(yīng)考慮采用“完成端口”模型。要記住的一個基本準(zhǔn)則是,假如要為Windows NT或Windows 2000開發(fā)高性能的服務(wù)器應(yīng)用,同時希望為大量套接字I/O請求提供服務(wù)(Web服務(wù)器便是這方面的典型例子),那么I/O完成端口模型便是最佳選擇!(節(jié)選自《Windows網(wǎng)絡(luò)編程》第八章)
完成端口模型是我最喜愛的一種模型。雖然其實現(xiàn)比較復(fù)雜(其實我覺得它的實現(xiàn)比用事件通知實現(xiàn)的重疊I/O簡單多了),但其效率是驚人的。我在T公司的時候曾經(jīng)幫同事寫過一個郵件服務(wù)器的性能測試程序,用的就是完成端口模型。結(jié)果表明,完成端口模型在多連接(成千上萬)的情況下,僅僅依靠一兩個輔助線程,就可以達(dá)到非常高的吞吐量。下面我還是從代碼說起:
#include <WINSOCK2.H>
#include <stdio.h>
#define PORT 5150
#define MSGSIZE 1024
#pragma comment(lib, "ws2_32.lib")
typedef enum
{
RECV_POSTED
}OPERATION_TYPE;
typedef struct
{
WSAOVERLAPPED overlap;
WSABUF Buffer;
char szMessage[MSGSIZE];
DWORD NumberOfBytesRecvd;
DWORD Flags;
OPERATION_TYPE OperationType;
}PER_IO_OPERATION_DATA, *LPPER_IO_OPERATION_DATA;
DWORD WINAPI WorkerThread(LPVOID);
int main()
{
WSADATA wsaData;
SOCKET sListen, sClient;
SOCKADDR_IN local, client;
DWORD i, dwThreadId;
int iaddrSize = sizeof(SOCKADDR_IN);
HANDLE CompletionPort = INVALID_HANDLE_VALUE;
SYSTEM_INFO systeminfo;
LPPER_IO_OPERATION_DATA lpPerIOData = NULL;
// Initialize Windows Socket library
WSAStartup(0x0202, &wsaData);
// Create completion port
CompletionPort = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
// Create worker thread
GetSystemInfo(&systeminfo);
for (i = 0; i < systeminfo.dwNumberOfProcessors; i++)
{
CreateThread(NULL, 0, WorkerThread, CompletionPort, 0, &dwThreadId);
}
// Create listening socket
sListen = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
// Bind
local.sin_addr.S_un.S_addr = htonl(INADDR_ANY);
local.sin_family = AF_INET;
local.sin_port = htons(PORT);
bind(sListen, (struct sockaddr *)&local, sizeof(SOCKADDR_IN));
// Listen
listen(sListen, 3);
while (TRUE)
{
// Accept a connection
sClient = accept(sListen, (struct sockaddr *)&client, &iaddrSize);
printf("Accepted client:%s:%d\n", inet_ntoa(client.sin_addr), ntohs(client.sin_port));
// Associate the newly arrived client socket with completion port
CreateIoCompletionPort((HANDLE)sClient, CompletionPort, (DWORD)sClient, 0);
// Launch an asynchronous operation for new arrived connection
lpPerIOData = (LPPER_IO_OPERATION_DATA)HeapAlloc(
GetProcessHeap(),
HEAP_ZERO_MEMORY,
sizeof(PER_IO_OPERATION_DATA));
lpPerIOData->Buffer.len = MSGSIZE;
lpPerIOData->Buffer.buf = lpPerIOData->szMessage;
lpPerIOData->OperationType = RECV_POSTED;
WSARecv(sClient,
&lpPerIOData->Buffer,
1,
&lpPerIOData->NumberOfBytesRecvd,
&lpPerIOData->Flags,
&lpPerIOData->overlap,
NULL);
}
PostQueuedCompletionStatus(CompletionPort, 0xFFFFFFFF, 0, NULL);
CloseHandle(CompletionPort);
closesocket(sListen);
WSACleanup();
return 0;
}
DWORD WINAPI WorkerThread(LPVOID CompletionPortID)
{
HANDLE CompletionPort=(HANDLE)CompletionPortID;
DWORD dwBytesTransferred;
SOCKET sClient;
LPPER_IO_OPERATION_DATA lpPerIOData = NULL;
while (TRUE)
{
GetQueuedCompletionStatus(
CompletionPort,
&dwBytesTransferred,
&sClient,
(LPOVERLAPPED *)&lpPerIOData,
INFINITE);
if (dwBytesTransferred == 0xFFFFFFFF)
{
return 0;
}
if (lpPerIOData->OperationType == RECV_POSTED)
{
if (dwBytesTransferred == 0)
{
// Connection was closed by client
closesocket(sClient);
HeapFree(GetProcessHeap(), 0, lpPerIOData);
}
else
{
lpPerIOData->szMessage[dwBytesTransferred] = '\0';
send(sClient, lpPerIOData->szMessage, dwBytesTransferred, 0);
// Launch another asynchronous operation for sClient
memset(lpPerIOData, 0, sizeof(PER_IO_OPERATION_DATA));
lpPerIOData->Buffer.len = MSGSIZE;
lpPerIOData->Buffer.buf = lpPerIOData->szMessage;
lpPerIOData->OperationType = RECV_POSTED;
WSARecv(sClient,
&lpPerIOData->Buffer,
1,
&lpPerIOData->NumberOfBytesRecvd,
&lpPerIOData->Flags,
&lpPerIOData->overlap,
NULL);
}
}
}
return 0;
}
首先,說說主線程:
1.創(chuàng)建完成端口對象
2.創(chuàng)建工作者線程(這里工作者線程的數(shù)量是按照CPU的個數(shù)來決定的,這樣可以達(dá)到最佳性能)
3.創(chuàng)建監(jiān)聽套接字,綁定,監(jiān)聽,然后程序進(jìn)入循環(huán)
4.在循環(huán)中,我做了以下幾件事情:
(1).接受一個客戶端連接
(2).將該客戶端套接字與完成端口綁定到一起(還是調(diào)用CreateIoCompletionPort,但這次的作用不同),注意,按道理來講,此時傳遞給CreateIoCompletionPort的第三個參數(shù)應(yīng)該是一個完成鍵,一般來講,程序都是傳遞一個單句柄數(shù)據(jù)結(jié)構(gòu)的地址,該單句柄數(shù)據(jù)包含了和該客戶端連接有關(guān)的信息,由于我們只關(guān)心套接字句柄,所以直接將套接字句柄作為完成鍵傳遞;
(3).觸發(fā)一個WSARecv異步調(diào)用,這次又用到了“尾隨數(shù)據(jù)”,使接收數(shù)據(jù)所用的緩沖區(qū)緊跟在WSAOVERLAPPED對象之后,此外,還有操作類型等重要信息。
在工作者線程的循環(huán)中,我們
1.調(diào)用GetQueuedCompletionStatus取得本次I/O的相關(guān)信息(例如套接字句柄、傳送的字節(jié)數(shù)、單I/O數(shù)據(jù)結(jié)構(gòu)的地址等等)
2.通過單I/O數(shù)據(jù)結(jié)構(gòu)找到接收數(shù)據(jù)緩沖區(qū),然后將數(shù)據(jù)原封不動的發(fā)送到客戶端
3.再次觸發(fā)一個WSARecv異步操作
六.五種I/O模型的比較
我會從以下幾個方面來進(jìn)行比較
*有無每線程64連接數(shù)限制
如果在選擇模型中沒有重新定義FD_SETSIZE宏,則每個fd_set默認(rèn)可以裝下64個SOCKET。同樣的,受MAXIMUM_WAIT_OBJECTS宏的影響,事件選擇、用事件通知實現(xiàn)的重疊I/O都有每線程最大64連接數(shù)限制。如果連接數(shù)成千上萬,則必須對客戶端套接字進(jìn)行分組,這樣,勢必增加程序的復(fù)雜度。
相反,異步選擇、用完成例程實現(xiàn)的重疊I/O和完成端口不受此限制。
*線程數(shù)
除了異步選擇以外,其他模型至少需要2個線程。一個主線程和一個輔助線程。同樣的,如果連接數(shù)大于64,則選擇模型、事件選擇和用事件通知實現(xiàn)的重疊I/O的線程數(shù)還要增加。
*實現(xiàn)的復(fù)雜度
我的個人看法是,在實現(xiàn)難度上,異步選擇<選擇<用完成例程實現(xiàn)的重疊I/O<事件選擇<完成端口<用事件通知實現(xiàn)的重疊I/O
*性能
由于選擇模型中每次都要重設(shè)讀集,在select函數(shù)返回后還要針對所有套接字進(jìn)行逐一測試,我的感覺是效率比較差;完成端口和用完成例程實現(xiàn)的重疊I/O基本上不涉及全局?jǐn)?shù)據(jù),效率應(yīng)該是最高的,而且在多處理器情形下完成端口還要高一些;事件選擇和用事件通知實現(xiàn)的重疊I/O在實現(xiàn)機(jī)制上都是采用WSAWaitForMultipleEvents,感覺效率差不多;至于異步選擇,不好比較。所以我的結(jié)論是:選擇<用事件通知實現(xiàn)的重疊I/O<事件選擇<用完成例程實現(xiàn)的重疊I/O<完成端口