#-*- coding: utf-8 -*-
import torch
from torch import nn
from fvcore.nn import FlopCountAnalysis, flop_count_table
from pypapi import events, papi_high as high


def main():
    # A single 3x3 conv layer, run on CPU in double precision so that the
    # PAPI double-precision counter (PAPI_DP_OPS) sees every FLOP it executes.
    model = nn.Conv2d(256, 128, 3, padding=1)
    model.cpu()
    model.double()
    model.eval()
    print("PAPI, theoretical")
    for bs in range(1, 13):
        input = torch.randn((bs, 256, 28, 28)).double()
        with torch.no_grad():
            for evt in ['PAPI_DP_OPS']:
                # Measured FLOPs: hardware counters around one forward pass.
                high.start_counters([getattr(events, evt)])
                _ = model(input)
                papi_flop = high.stop_counters()[0] / 1e9
                # Theoretical FLOPs from fvcore, which counts multiply-adds (MACs);
                # multiply by 2 to match PAPI's one-count-per-FP-operation convention.
                flop = FlopCountAnalysis(model, input).total() / 1e9
                flop *= 2  # different convention
                print(papi_flop, flop)


if __name__ == '__main__':
    main()
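
As a sanity check on the "theoretical" column, the conv layer keeps the 28x28 spatial size (3x3 kernel, padding 1), so the count can be done by hand. The sketch below assumes fvcore's usual MAC-based rule for Conv2d (one multiply-add per kernel tap per output element, bias ignored) and the x2 convention used above; conv2d_gflops is a hypothetical helper, not part of either library.

def conv2d_gflops(bs, cin=256, cout=128, k=3, h=28, w=28):
    # One MAC per output element per kernel tap, counted as 2 FLOPs (multiply + add).
    macs = bs * cout * h * w * cin * k * k
    return 2 * macs / 1e9

for bs in range(1, 13):
    # bs=1 gives ~0.462 GFLOPs; the count grows linearly with batch size.
    print(bs, conv2d_gflops(bs))

The second script applies the same PAPI-versus-fvcore comparison to RoBERTa-large (MNLI) from fairseq.
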
#-*- coding: utf-8 -*-
#File:
import argparse
import torch
from torch import nn
from fvcore.nn import FlopCountAnalysis, flop_count_table, flop_count_str
from pypapi import events, papi_high as high


def main():
    # RoBERTa-large fine-tuned on MNLI, run on CPU in double precision so that
    # PAPI_DP_OPS captures the whole forward pass.
    roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
    roberta.cpu()
    roberta.double()
    roberta.eval()

    # Build an input of exactly tgt_len BPE tokens (including special tokens).
    tgt_len = 50
    tokens = roberta.encode('hi' * (tgt_len - 3)).unsqueeze(0).to(device=roberta.device)
    assert tokens.numel() == tgt_len

    # Thin wrapper so that FlopCountAnalysis traces extract_features() rather
    # than the model's default forward().
    class A(nn.Module):
        def forward(self, tokens):
            return self.m.extract_features(tokens)

    with torch.no_grad():
        # Measured FLOPs: hardware counters around one call to extract_features().
        for evt in ['PAPI_DP_OPS']:
            high.start_counters([getattr(events, evt)])
            features, _ = roberta.model.extract_features(tokens)
            papi_flops = high.stop_counters()
            print('total flops (papi, {})'.format(evt), papi_flops[0] / 1e9)

        # Theoretical FLOPs from fvcore (reported in multiply-adds).
        model = A(); model.m = roberta.model
        flop = FlopCountAnalysis(model, tokens)
        print(flop_count_table(flop, max_depth=5, show_param_shapes=False))
        print(flop_count_str(flop))
        print("Total", flop.total() / 1e9)


if __name__ == '__main__':
    main()
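
The output of the RoBERTa script follows. Its per-module numbers can be reproduced by hand from the sizes shown in the table (sequence length 50, d_model 1024, FFN width 4096, vocabulary 50265, 24 layers) plus the standard RoBERTa-large head count of 16 (an assumption, not printed in the table). The sketch below is only a rough cross-check in fvcore's MAC convention; it ignores layer norms, embeddings, and the classification head, and the variable names are ad hoc.

T, d, ffn, heads, layers, vocab = 50, 1024, 4096, 16, 24, 50265

qkv_out   = 4 * T * d * d                     # q/k/v/out projections    ~0.210G MACs
attn      = 2 * heads * T * T * (d // heads)  # QK^T and attn @ V        ~0.005G
fc        = 2 * T * d * ffn                   # fc1 + fc2                ~0.419G
per_layer = qkv_out + attn + fc               # ~0.635G per TransformerEncoderLayer
encoder   = layers * per_layer                # ~15.2G for the 24-layer sentence encoder
lm_head   = T * d * d + T * d * vocab         # dense + vocab projection ~2.63G
print(per_layer / 1e9, encoder / 1e9, (encoder + lm_head) / 1e9)  # ~0.635, ~15.2, ~17.9
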
total flops (papi, PAPI_DP_OPS) 9.003876419

| module | #parameters | #flops |
|:---------------------------------------------------------------|:--------------|:------------|
| model.m | 0.356G | 17.9G |
| m.encoder | 0.355G | 17.9G |
| m.encoder.sentence_encoder | 0.354G | 15.2G |
| m.encoder.sentence_encoder.embed_tokens | 51.5M | 0 |
| m.encoder.sentence_encoder.embed_positions | 0.526M | 0 |
| m.encoder.sentence_encoder.layernorm_embedding | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers | 0.302G | |
| m.encoder.sentence_encoder.layers.0 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.0.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.0.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.0.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.0.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.0.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.1 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.1.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.1.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.1.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.1.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.1.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.2 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.2.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.2.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.2.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.2.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.2.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.3 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.3.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.3.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.3.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.3.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.3.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.4 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.4.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.4.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.4.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.4.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.4.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.5 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.5.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.5.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.5.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.5.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.5.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.6 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.6.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.6.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.6.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.6.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.6.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.7 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.7.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.7.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.7.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.7.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.7.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.8 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.8.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.8.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.8.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.8.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.8.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.9 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.9.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.9.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.9.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.9.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.9.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.10 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.10.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.10.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.10.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.10.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.10.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.11 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.11.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.11.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.11.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.11.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.11.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.12 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.12.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.12.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.12.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.12.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.12.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.13 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.13.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.13.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.13.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.13.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.13.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.14 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.14.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.14.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.14.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.14.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.14.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.15 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.15.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.15.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.15.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.15.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.15.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.16 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.16.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.16.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.16.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.16.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.16.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.17 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.17.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.17.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.17.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.17.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.17.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.18 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.18.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.18.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.18.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.18.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.18.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.19 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.19.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.19.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.19.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.19.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.19.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.20 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.20.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.20.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.20.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.20.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.20.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.21 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.21.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.21.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.21.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.21.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.21.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.22 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.22.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.22.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.22.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.22.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.22.final_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.23 | 12.6M | 0.635G |
| m.encoder.sentence_encoder.layers.23.self_attn | 4.2M | 0.215G |
| m.encoder.sentence_encoder.layers.23.self_attn_layer_norm | 2.05K | 0.256M |
| m.encoder.sentence_encoder.layers.23.fc1 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.23.fc2 | 4.2M | 0.21G |
| m.encoder.sentence_encoder.layers.23.final_layer_norm | 2.05K | 0.256M |
| m.encoder.lm_head | 1.1M | 2.63G |
| m.encoder.lm_head.dense | 1.05M | 52.4M |
| m.encoder.lm_head.layer_norm | 2.05K | 0.256M |
| m.classification_heads.mnli | 1.05M | |
| m.classification_heads.mnli.dense | 1.05M | |
| m.classification_heads.mnli.out_proj | 3.08K | |

Input sizes (torch.Tensor only): [[50]]
N/A indicates a possibly missing statistic due to how the module was called. Missing values are still included in the parent's total.
| A( | |
| n_params: 0.356G, n_flops: 17.9G | |
| (m): RobertaModel( | |
| n_params: 0.356G, n_flops: 17.9G | |
| (encoder): RobertaEncoder( | |
| n_params: 0.355G, n_flops: 17.9G | |
| (sentence_encoder): TransformerEncoder( | |
| n_params: 0.354G, n_flops: 15.2G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (embed_tokens): Embedding( | |
| 50265, 1024, padding_idx=1 | |
| n_params: 51.5M, n_flops: 0 | |
| ) | |
| (embed_positions): LearnedPositionalEmbedding( | |
| 514, 1024, padding_idx=1 | |
| n_params: 0.526M, n_flops: 0 | |
| ) | |
| (layernorm_embedding): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (layers): ModuleList( | |
| n_params: 0.302G, n_flops: N/A | |
| (0): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (1): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (2): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (3): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (4): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (5): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (6): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (7): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (8): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (9): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (10): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (11): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (12): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (13): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (14): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (15): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (16): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (17): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (18): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (19): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (20): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (21): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (22): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| (23): TransformerEncoderLayer( | |
| n_params: 12.6M, n_flops: 0.635G | |
| (self_attn): MultiheadAttention( | |
| n_params: 4.2M, n_flops: 0.215G | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (k_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (v_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (q_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| ) | |
| (self_attn_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| (dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (activation_dropout_module): FairseqDropout(n_params: 0, n_flops: N/A) | |
| (fc1): Linear( | |
| in_features=1024, out_features=4096, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (fc2): Linear( | |
| in_features=4096, out_features=1024, bias=True | |
| n_params: 4.2M, n_flops: 0.21G | |
| ) | |
| (final_layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| ) | |
| ) | |
| (lm_head): RobertaLMHead( | |
| n_params: 1.1M, n_flops: 2.63G | |
| (dense): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: 52.4M | |
| ) | |
| (layer_norm): LayerNorm( | |
| (1024,), eps=1e-05, elementwise_affine=True | |
| n_params: 2.05K, n_flops: 0.256M | |
| ) | |
| ) | |
| ) | |
| (classification_heads): ModuleDict( | |
| n_params: 1.05M, n_flops: N/A | |
| (mnli): RobertaClassificationHead( | |
| n_params: 1.05M, n_flops: N/A | |
| (dense): Linear( | |
| in_features=1024, out_features=1024, bias=True | |
| n_params: 1.05M, n_flops: N/A | |
| ) | |
| (dropout): Dropout( | |
| p=0.3, inplace=False | |
| n_params: 0, n_flops: N/A | |
| ) | |
| (out_proj): Linear( | |
| in_features=1024, out_features=3, bias=True | |
| n_params: 3.08K, n_flops: N/A | |
| ) | |
| ) | |
| ) | |
| ) | |
| ) | |
Total 17.8611712
test-conv-flop.py prints: