LinuxのデフォルトのTCPのウインドウサイズが小さすぎるので、ちょっと調べた。
まずデフォルトはどうなっているのかというと、Window Sizeに関連するパラメータはRHEL5では以下の状態。
net.ipv4.tcp_rmem = 4096 87380 4194304
net.ipv4.tcp_wmem = 4096 16384 4194304
net.core.rmem_max = 131071
net.core.wmem_max = 131071
最適な値はそれぞれのシステムによって異なるわけだけれども、「TCP Tuning Guide – Linux TCP Tuning」を見て、上記4つと、ついでにBIC TCP関連もチューニング。
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
net.ipv4.tcp_no_metrics_save = 1
net.core.netdev_max_backlog = 2500
congestion control(輻輳制御)はスループットを計測しながらチューンしないとダメっぽいね…。
net.ipv4.tcp_congestion_control=htcp
このWindow Sizeの初期値はkernelのどのあたりにあるかというと、linux-2.6.9/net/ipv4/tcp_output.c。
159 void tcp_select_initial_window(int __space, __u32 mss,
160 __u32 *rcv_wnd, __u32 *window_clamp,
161 int wscale_ok, __u8 *rcv_wscale)
162 {
163 unsigned int space = (__space < 0 ? 0 : __space);
164
165 /* If no clamp set the clamp to the max possible scaled window */
166 if (*window_clamp == 0)
167 (*window_clamp) = (65535 << 14);
168 space = min(*window_clamp, space);
169
170 /* Quantize space offering to a multiple of mss if possible. */
171 if (space > mss)
172 space = (space / mss) * mss;
173
174 /* NOTE: offering an initial window larger than 32767
175 * will break some buggy TCP stacks. We try to be nice.
176 * If we are not window scaling, then this truncates
177 * our initial window offering to 32k. There should also
178 * be a sysctl option to stop being nice.
179 */
180 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
181 (*rcv_wscale) = 0;
182 if (wscale_ok) {
183 /* Set window scaling on max possible window
184 * See RFC1323 for an explanation of the limit to 14
185 */
186 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
187 while (space > 65535 && (*rcv_wscale) < 14) {
188 space >>= 1;
189 (*rcv_wscale)++;
190 }
191 }
192
193 /* Set initial window to value enough for senders,
194 * following RFC1414. Senders, not following this RFC,
195 * will be satisfied with 2.
196 */
197 if (mss > (1<<*rcv_wscale)) {
198 int init_cwnd = 4;
199 if (mss > 1460*3)
200 init_cwnd = 2;
201 else if (mss > 1460)
202 init_cwnd = 3;
203 if (*rcv_wnd > init_cwnd*mss)
204 *rcv_wnd = init_cwnd*mss;
205 }
206
207 /* Set the clamp no higher than max representable value */
208 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
209 }
linux-2.6.18/net/ipv4/tcp_output.c。
170 void tcp_select_initial_window(int __space, __u32 mss,
171 __u32 *rcv_wnd, __u32 *window_clamp,
172 int wscale_ok, __u8 *rcv_wscale)
173 {
174 unsigned int space = (__space < 0 ? 0 : __space);
175
176 /* If no clamp set the clamp to the max possible scaled window */
177 if (*window_clamp == 0)
178 (*window_clamp) = (65535 << 14);
179 space = min(*window_clamp, space);
180
181 /* Quantize space offering to a multiple of mss if possible. */
182 if (space > mss)
183 space = (space / mss) * mss;
184
185 /* NOTE: offering an initial window larger than 32767
186 * will break some buggy TCP stacks. If the admin tells us
187 * it is likely we could be speaking with such a buggy stack
188 * we will truncate our initial window offering to 32K-1
189 * unless the remote has sent us a window scaling option,
190 * which we interpret as a sign the remote TCP is not
191 * misinterpreting the window field as a signed quantity.
192 */
193 if (sysctl_tcp_workaround_signed_windows)
194 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
195 else
196 (*rcv_wnd) = space;
197
198 (*rcv_wscale) = 0;
199 if (wscale_ok) {
200 /* Set window scaling on max possible window
201 * See RFC1323 for an explanation of the limit to 14
202 */
203 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
204 space = min_t(u32, space, *window_clamp);
205 while (space > 65535 && (*rcv_wscale) < 14) {
206 space >>= 1;
207 (*rcv_wscale)++;
208 }
209 }
210
211 /* Set initial window to value enough for senders,
212 * following RFC2414. Senders, not following this RFC,
213 * will be satisfied with 2.
214 */
215 if (mss > (1<<*rcv_wscale)) {
216 int init_cwnd = 4;
217 if (mss > 1460*3)
218 init_cwnd = 2;
219 else if (mss > 1460)
220 init_cwnd = 3;
221 if (*rcv_wnd > init_cwnd*mss)
222 *rcv_wnd = init_cwnd*mss;
223 }
224
225 /* Set the clamp no higher than max representable value */
226 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
227 }
RHEL4ではnet.ipv4.tcp_window_scaling = 0の際には、Window Sizeは32Kでキャップされていて、少なくともRHEL2.1のGA版でもlinux/include/net/tcp.hにはこのMAX_TCP_WINDOWが定義されている。
要するに古いネットワーク機器のTCP/IPスタックの実装者が、TCPヘッダの16bitのWindowフィールドをSigned Intとして扱ってしまった、つまり正の値として使えるのは15bitなので上限が32767になっている実装がある、という話。
265 /*
266 * Never offer a window over 32767 without using window scaling. Some
267 * poor stacks do signed 16bit maths!
268 */
269 #define MAX_TCP_WINDOW 32767
RHEL4のlinux-2.6.9/include/net/tcp.hでは、ご丁寧にunsignedを示すサフィックス「U」が追加されている。
382 /*
383 * Never offer a window over 32767 without using window scaling. Some
384 * poor stacks do signed 16bit maths!
385 */
386 #define MAX_TCP_WINDOW 32767U
これはRHEL5のlinux-2.6.18/include/net/tcp.hでも同じ。
51 /*
52 * Never offer a window over 32767 without using window scaling. Some
53 * poor stacks do signed 16bit maths!
54 */
55 #define MAX_TCP_WINDOW 32767U
diffを取ると、2.6.17で追加されたパッチが明確に。
diff 2.6.18_tcp_output.c 2.6.9_tcp_output.c
24,28c22
< if (sysctl_tcp_workaround_signed_windows)
< (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
< else
< (*rcv_wnd) = space;
<
---
> (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
コードからも明らかなように、sysctlに項目が追加されている。
sysctl -a | grep signed
net.ipv4.tcp_workaround_signed_windows = 0
参考