TCP Window Size

LinuxのデフォルトのTCPのウインドウサイズが小さすぎるので、ちょっと調べた。

まずデフォルトはどうなっているのかというと、Window Sizeに関連するパラメータはRHEL5では以下の状態。

net.ipv4.tcp_rmem = 4096        87380   4194304
net.ipv4.tcp_wmem = 4096        16384   4194304
net.core.rmem_max = 131071
net.core.wmem_max = 131071

最適な値はそれぞれのシステムによって異なるわけだけれども、TCP Tuning Guide – Linux TCP Tuningを見て、上記4つと、ついでにBIC TCP関連もチューニング。

net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
net.ipv4.tcp_no_metrics_save = 1
net.core.netdev_max_backlog = 2500

congestion control(輻輳制御)はスループットを計測しながらチューンしないとダメっぽいね…。

net.ipv4.tcp_congestion_control=htcp

このWindow Sizeの初期値はkernelのどのあたりにあるかというと、linux-2.6.9/net/ipv4/tcp_output.c。

159 void tcp_select_initial_window(int __space, __u32 mss,
160                                __u32 *rcv_wnd, __u32 *window_clamp,
161                                int wscale_ok, __u8 *rcv_wscale)
162 {
163         unsigned int space = (__space < 0 ? 0 : __space);
164 
165         /* If no clamp set the clamp to the max possible scaled window */
166         if (*window_clamp == 0)
167                 (*window_clamp) = (65535 << 14);
168         space = min(*window_clamp, space);
169 
170         /* Quantize space offering to a multiple of mss if possible. */
171         if (space > mss)
172                 space = (space / mss) * mss;
173 
174         /* NOTE: offering an initial window larger than 32767
175          * will break some buggy TCP stacks. We try to be nice.
176          * If we are not window scaling, then this truncates
177          * our initial window offering to 32k. There should also
178          * be a sysctl option to stop being nice.
179          */
180         (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
181         (*rcv_wscale) = 0;
182         if (wscale_ok) {
183                 /* Set window scaling on max possible window
184                  * See RFC1323 for an explanation of the limit to 14 
185                  */
186                 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
187                 while (space > 65535 && (*rcv_wscale) < 14) {
188                         space >>= 1;
189                         (*rcv_wscale)++;
190                 }
191         }
192 
193         /* Set initial window to value enough for senders,
194          * following RFC1414. Senders, not following this RFC,
195          * will be satisfied with 2.
196          */
197         if (mss > (1<<*rcv_wscale)) {
198                 int init_cwnd = 4;
199                 if (mss > 1460*3)
200                         init_cwnd = 2;
201                 else if (mss > 1460)
202                         init_cwnd = 3;
203                 if (*rcv_wnd > init_cwnd*mss)
204                         *rcv_wnd = init_cwnd*mss;
205         }
206 
207         /* Set the clamp no higher than max representable value */
208         (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
209 }

同じ関数をRHEL5のlinux-2.6.18/net/ipv4/tcp_output.cで見ると以下のとおり。

170 void tcp_select_initial_window(int __space, __u32 mss,
171                                __u32 *rcv_wnd, __u32 *window_clamp,
172                                int wscale_ok, __u8 *rcv_wscale)
173 {
174         unsigned int space = (__space < 0 ? 0 : __space);
175 
176         /* If no clamp set the clamp to the max possible scaled window */
177         if (*window_clamp == 0)
178                 (*window_clamp) = (65535 << 14);
179         space = min(*window_clamp, space);
180 
181         /* Quantize space offering to a multiple of mss if possible. */
182         if (space > mss)
183                 space = (space / mss) * mss;
184 
185         /* NOTE: offering an initial window larger than 32767
186          * will break some buggy TCP stacks. If the admin tells us
187          * it is likely we could be speaking with such a buggy stack
188          * we will truncate our initial window offering to 32K-1
189          * unless the remote has sent us a window scaling option,
190          * which we interpret as a sign the remote TCP is not
191          * misinterpreting the window field as a signed quantity.
192          */
193         if (sysctl_tcp_workaround_signed_windows)
194                 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
195         else
196                 (*rcv_wnd) = space;
197 
198         (*rcv_wscale) = 0;
199         if (wscale_ok) {
200                 /* Set window scaling on max possible window
201                  * See RFC1323 for an explanation of the limit to 14 
202                  */
203                 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
204                 space = min_t(u32, space, *window_clamp);
205                 while (space > 65535 && (*rcv_wscale) < 14) {
206                         space >>= 1;
207                         (*rcv_wscale)++;
208                 }
209         }
210 
211         /* Set initial window to value enough for senders,
212          * following RFC2414. Senders, not following this RFC,
213          * will be satisfied with 2.
214          */
215         if (mss > (1<<*rcv_wscale)) {
216                 int init_cwnd = 4;
217                 if (mss > 1460*3)
218                         init_cwnd = 2;
219                 else if (mss > 1460)
220                         init_cwnd = 3;
221                 if (*rcv_wnd > init_cwnd*mss)
222                         *rcv_wnd = init_cwnd*mss;
223         }
224 
225         /* Set the clamp no higher than max representable value */
226         (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
227 }

RHEL4ではnet.ipv4.tcp_window_scaling = 0の際には、Window Sizeは32Kでキャップされていて、少なくともRHEL2.1のGA版でもlinux/include/net/tcp.hにはこのMAX_TCP_WINDOWが定義されている。
要するに、古いネットワーク機器のTCP/IPスタックの実装者が、TCPヘッダの16bitのWindowフィールドをSigned Int(符号付き整数)として扱ってしまった、つまり正の値として使えるのは15bit分だけなので上限が32767になっている実装がある、という話。

265 /*
266  * Never offer a window over 32767 without using window scaling. Some
267  * poor stacks do signed 16bit maths!
268  */
269 #define MAX_TCP_WINDOW          32767

RHEL4のlinux-2.6.9/include/net/tcp.hでは、ご丁寧にUnsignedを示すサフィックス「U」が追加されている(32767 → 32767U)。

382 /* 
383  * Never offer a window over 32767 without using window scaling. Some
384  * poor stacks do signed 16bit maths! 
385  */
386 #define MAX_TCP_WINDOW          32767U

これはRHEL5のlinux-2.6.18/include/net/tcp.hでも同じ。

51 /* 
52  * Never offer a window over 32767 without using window scaling. Some
53  * poor stacks do signed 16bit maths! 
54  */
55 #define MAX_TCP_WINDOW          32767U

diffを取ると、2.6.17で追加されたパッチが明確に。

diff 2.6.18_tcp_output.c 2.6.9_tcp_output.c
24,28c22
<         if (sysctl_tcp_workaround_signed_windows)
<                 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
<         else
<                 (*rcv_wnd) = space;
< 
---
>         (*rcv_wnd) = min(space, MAX_TCP_WINDOW);

コードからも明らかなように、sysctlに項目が追加されている。

sysctl -a | grep signed
net.ipv4.tcp_workaround_signed_windows = 0

参考