Publications
2020
Udit Gupta, Young Geun Kim, Sylvia Lee, Jordan Tse, Hsien-Hsin S. Lee, Gu-Yeon Wei, David Brooks, Carole-Jean Wu. Chasing Carbon: The Elusive Environmental Footprint of Computing. IEEE International Symposium on High-Performance Computer Architecture (HPCA), 2021. Forthcoming. https://arxiv.org/abs/2011.02839
Abstract: Given recent algorithm, software, and hardware innovation, computing has enabled a plethora of new applications. As computing becomes increasingly ubiquitous, however, so does its environmental impact. This paper brings the issue to the attention of computer-systems researchers. Our analysis, built on industry-reported characterization, quantifies the environmental effects of computing in terms of carbon emissions. Broadly, carbon emissions have two sources: operational energy consumption, and hardware manufacturing and infrastructure. Although carbon emissions from the former are decreasing thanks to algorithmic, software, and hardware innovations that boost performance and power efficiency, the overall carbon footprint of computer systems continues to grow. This work quantifies the carbon output of computer systems to show that most emissions related to modern mobile and data-center equipment come from hardware manufacturing and infrastructure. We therefore outline future directions for minimizing the environmental impact of computing systems.
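As an illustration of the operational-versus-embodied split named in the abstract, the sketch below shows the basic bookkeeping. Every number is a placeholder chosen only to demonstrate the arithmetic; none comes from the paper.

```python
# Lifetime carbon footprint split into the two sources the abstract names.
# All values below are illustrative placeholders, not results from the paper.
embodied_kgco2 = 60.0                      # hardware manufacturing + infrastructure
lifetime_years = 3
energy_per_year_kwh = 10.0                 # operational energy of the device
grid_intensity_kgco2_per_kwh = 0.4         # carbon intensity of the electricity supply

operational_kgco2 = lifetime_years * energy_per_year_kwh * grid_intensity_kgco2_per_kwh
total_kgco2 = embodied_kgco2 + operational_kgco2

print(f"operational: {operational_kgco2:.1f} kgCO2e ({100 * operational_kgco2 / total_kgco2:.0f}% of total)")
print(f"embodied:    {embodied_kgco2:.1f} kgCO2e ({100 * embodied_kgco2 / total_kgco2:.0f}% of total)")
```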
Samuel Hsia, Udit Gupta, Mark Wilkening, Carole-Jean Wu, Gu-Yeon Wei, David Brooks. Cross-Stack Workload Characterization of Deep Recommendation Systems. IEEE International Symposium on Workload Characterization (IISWC), 2020.
Glenn G. Ko, Yuji Chai, Marco Donato, Paul N. Whatmough, Thierry Tambe, Rob A. Rutenbar, David Brooks, Gu-Yeon Wei. A Scalable Bayesian Inference Accelerator for Unsupervised Learning. IEEE Hot Chips 31 Symposium, 2020.
Thierry Tambe, En-Yu Yang, Zishen Wan, Yuntian Deng, Vijay Janapa Reddi, Alexander M. Rush, David Brooks, Gu-Yeon Wei. Algorithm-Hardware Co-Design of Adaptive Floating-Point Encodings for Resilient Deep Learning Inference. Design Automation Conference (DAC 2020), 2020. Best paper award. http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2020/07/B1743_030_5_1594671163.pdf
Abstract: Conventional hardware-friendly quantization methods, such as fixed-point or integer, tend to perform poorly at very low precision as their shrunken dynamic ranges cannot adequately capture the wide data distributions commonly seen in sequence transduction models. We present an algorithm-hardware co-design centered around a novel floating-point inspired number format, AdaptivFloat, that dynamically maximizes and optimally clips its available dynamic range, at a layer granularity, in order to create faithful encodings of neural network parameters. AdaptivFloat consistently produces higher inference accuracies compared to block floating-point, uniform, IEEE-like float, or posit encodings at low bit precision (≤ 8-bit) across a diverse set of state-of-the-art neural networks exhibiting narrow to wide weight distributions. Notably, at 4-bit weight precision, only a 2.1-point degradation in BLEU score is observed on the AdaptivFloat-quantized Transformer network, compared to total accuracy loss when encoded in the above-mentioned prominent datatypes. Furthermore, experimental results on a deep neural network (DNN) processing element (PE) exploiting AdaptivFloat logic in its computational datapath demonstrate per-operation energy and area of 0.9× and 1.14×, respectively, that of an equivalent bit-width NVDLA-like integer-based PE.
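The core per-layer idea the abstract describes, choosing the exponent range of a low-bit float format from each tensor's own maximum magnitude before rounding, can be sketched in a few lines. This is a simplified illustration under assumed format parameters, not the authors' reference implementation.

```python
import numpy as np

def adaptivfloat_quantize(weights, n_bits=4, n_exp=2):
    """Simplified AdaptivFloat-style quantization (illustrative sketch only)."""
    n_mant = n_bits - 1 - n_exp                      # 1 sign bit, n_exp exponent bits
    w = np.asarray(weights, dtype=np.float64)
    max_abs = float(np.max(np.abs(w)))
    if max_abs == 0.0:
        return np.zeros_like(w)

    # Pick the per-tensor exponent range so the largest representable
    # magnitude, (2 - 2**-n_mant) * 2**exp_max, covers max|w|.
    exp_max = int(np.ceil(np.log2(max_abs / (2.0 - 2.0 ** -n_mant))))
    exp_min = exp_max - (2 ** n_exp) + 1
    largest = (2.0 - 2.0 ** -n_mant) * 2.0 ** exp_max
    smallest = 2.0 ** exp_min

    sign = np.sign(w)
    mag = np.clip(np.abs(w), 0.0, largest)
    exps = np.clip(np.floor(np.log2(np.maximum(mag, smallest))), exp_min, exp_max)
    mant = np.round(mag / 2.0 ** exps * 2 ** n_mant) / 2 ** n_mant
    q = sign * np.minimum(mant * 2.0 ** exps, largest)
    q[np.abs(w) < smallest / 2.0] = 0.0              # underflow to zero (simplified)
    return q
```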
Paul N. Whatmough, Marco Donato, Glenn G. Ko, Sae Kyu Lee, David Brooks, Gu-Yeon Wei. CHIPKIT: An agile, reusable open-source framework for rapid test chip development. IEEE Micro, 2020.
Udit Gupta, Samuel Hsia, Vikram Saraph, Xiaodong Wang, Brandon Reagen, Gu-Yeon Wei, Hsien-Hsin S. Lee, David Brooks, Carole-Jean Wu. DeepRecSys: A System for Optimizing End-To-End At-scale Neural Recommendation Inference. The 47th IEEE/ACM International Symposium on Computer Architecture (ISCA 2020), 2020. http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2020/05/DeepRecSys_Gupta_ISCA2020.pdf
Abstract: Neural personalized recommendation is the cornerstone of a wide collection of cloud services and products, constituting significant compute demand of the cloud infrastructure. Thus, improving the execution efficiency of neural recommendation directly translates into infrastructure capacity savings. In this paper, we devise a novel end-to-end modeling infrastructure, DeepRecInfra, that adopts an algorithm and system co-design methodology to custom-design systems for recommendation use cases. Leveraging the insights from the recommendation characterization, a new dynamic scheduler, DeepRecSched, is proposed to maximize latency-bounded throughput by taking into account characteristics of inference query size and arrival patterns, recommendation model architectures, and underlying hardware systems. By doing so, system throughput is doubled across the eight industry-representative recommendation models. Finally, design, deployment, and evaluation in an at-scale production datacenter show over 30% latency reduction across a wide variety of recommendation models running on hundreds of machines.
Liu Ke, Udit Gupta, Carole-Jean Wu, Benjamin Youngjae Cho, Mark Hempstead, Brandon Reagen, Xuan Zhang, David Brooks, Vikas Chandra, Utku Diril, Amin Firoozshahian, Kim Hazelwood, Bill Jia, Hsien-Hsin S. Lee, Meng Li, Bert Maher, Dheevatsa Mudigere, Maxim Naumov, Martin Schatz, Mikhail Smelyanskiy, Xiaodong Wang. RecNMP: Accelerating Personalized Recommendation with Near-Memory Processing. The 47th IEEE/ACM International Symposium on Computer Architecture (ISCA 2020), 2020. http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2020/05/ISCA_2020_Near_Memory_Acceleration__Ke_.pdf
Abstract: Personalized recommendation systems leverage deep learning models and account for the majority of data center AI cycles. Their performance is dominated by memory-bound sparse embedding operations with unique irregular memory access patterns that pose a fundamental challenge to accelerate. This paper proposes a lightweight, commodity-DRAM-compliant, near-memory processing solution to accelerate personalized recommendation inference. The in-depth characterization of production-grade recommendation models shows that embedding operations with high model-, operator-, and data-level parallelism lead to memory bandwidth saturation, limiting recommendation inference performance. We propose RecNMP, which provides a scalable solution to improve system throughput, supporting a broad range of sparse embedding models. RecNMP is specifically tailored to production environments with heavy co-location of operators on a single server. Several hardware/software co-optimization techniques such as memory-side caching, table-aware packet scheduling, and hot entry profiling are studied, resulting in up to 9.8x memory latency speedup over a highly-optimized baseline. Overall, RecNMP offers 4.2x throughput improvement and 45.8% memory energy savings.
Glenn G. Ko, Yuji Chai, Marco Donato, Paul N. Whatmough, Thierry Tambe, Rob A. Rutenbar, David Brooks, Gu-Yeon Wei. A 3mm² Programmable Bayesian Inference Accelerator for Unsupervised Machine Perception using Parallel Gibbs Sampling in 16nm. IEEE Symposium on VLSI Circuits (VLSI), 2020.
Yu Emma Wang, Gu-Yeon Wei, David Brooks. A Systematic Methodology for Analysis of Deep Learning Hardware and Software Platforms. Third Conference on Machine Learning and Systems (MLSys), 2020. https://proceedings.mlsys.org/papers/2020/12
Abstract: Training deep learning models is compute-intensive and there is an industry-wide trend towards hardware and software specialization to improve performance. To systematically compare deep learning systems, we introduce a methodology comprised of a set of analysis techniques and parameterized end-to-end models for fully connected, convolutional, and recurrent neural networks. This methodology can be applied to analyze various hardware and software systems, and is intended to complement traditional methods. We demonstrate its utility by comparing two generations of specialized platforms (Google's Cloud TPU v2/v3), three heterogeneous platforms (Google TPU, Nvidia GPU, and Intel CPU), and specialized software stacks (TensorFlow and CUDA).
Udit Gupta, Carole-Jean Wu, Xiaodong Wang, Maxim Naumov, Brandon Reagen, David Brooks, Bradford Cottel, Kim Hazelwood, Bill Jia, Hsien-Hsin S. Lee, Andrey Malevich, Dheevatsa Mudigere, Mikhail Smelyanskiy, Liang Xiong, Xuan Zhang. The Architectural Implications of Facebook's DNN-based Personalized Recommendation. The 26th IEEE International Symposium on High-Performance Computer Architecture (HPCA), 2020. https://arxiv.org/abs/1906.03109 https://drive.google.com/file/d/1v5LbaizV4PK1WvroA5DMBtXwDVRONZ4j/view https://drive.google.com/file/d/1WJ32Cdv1qPLxVk2VQC_Dbmh30sUe5Cfe/view
Abstract: The widespread application of deep learning has changed the landscape of computation in the data center. In particular, personalized recommendation for content ranking is now largely accomplished by leveraging deep neural networks. However, despite the importance of these models and the amount of compute cycles they consume, relatively little research attention has been devoted to systems for recommendation. To facilitate research and to advance the understanding of these workloads, this paper presents a set of real-world, production-scale DNNs for personalized recommendation coupled with relevant performance metrics for evaluation. In addition to releasing a set of open-source workloads, we conduct in-depth analysis that underpins future system design and optimization for at-scale recommendation: inference latency varies by 60% across three Intel server generations, batching and co-location of inferences can drastically improve latency-bounded throughput, and the diverse composition of recommendation models leads to different optimization strategies.
Paul Whatmough, Marco Donato, Glenn Ko, David Brooks, Gu-Yeon Wei. CHIPKIT: An agile, reusable open-source framework for rapid test chip development. arXiv preprint, 2020. https://arxiv.org/abs/2001.04504
Abstract: The current trend for domain-specific architectures (DSAs) has led to renewed interest in research test chips to demonstrate new specialized hardware. Tape-outs also offer huge pedagogical value garnered from real hands-on exposure to the whole system stack. However, successful tape-outs demand hard-earned experience, and the design process is time-consuming and fraught with challenges. Therefore, custom chips have remained the preserve of a small number of research groups, typically focused on circuit design research. This paper describes the CHIPKIT framework. We describe a reusable SoC subsystem which provides basic IO, an on-chip programmable host, memory, and peripherals. This subsystem can be readily extended with new IP blocks to generate custom test chips. We also present an agile RTL development flow, including a code generation tool called VGEN. Finally, we outline best practices for full-chip validation across the entire design cycle.
2019
Lillian Pentecost, Udit Gupta, Elisa Ngan, Gu-Yeon Wei, David Brooks, Johanna Beyer, Michael Behrisch. CHAMPVis: Comparative Hierarchical Analysis of Microarchitectural Performance. ProTools workshop, co-located with Supercomputing, 2019. http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2019/12/CHAMPVis2019.pdf
Abstract: Performance analysis and optimization are essential tasks for hardware and software engineers. In the age of datacenter-scale computing, it is particularly important to conduct comparative performance analysis to understand discrepancies and limitations among different hardware systems and applications. However, there is a distinct lack of productive visualization tools for these comparisons. We present CHAMPVis [1], a web-based, interactive visualization tool that leverages the hierarchical organization of hardware systems to enable productive performance analysis. With CHAMPVis, users can make definitive performance comparisons across applications or hardware platforms. In addition, CHAMPVis provides methods to rank and cluster based on performance metrics to identify common optimization opportunities. Our thorough task analysis reveals three types of datacenter-scale performance analysis tasks: summarization, detailed comparative analysis, and interactive performance bottleneck identification. We propose techniques for each class of tasks, including (1) 1-D feature space projection for similarity analysis; (2) hierarchical parallel coordinates for comparative analysis; and (3) user interactions for rapid diagnostic queries to identify optimization targets. We evaluate CHAMPVis by analyzing standard datacenter applications and machine learning benchmarks in two different case studies.
Lillian Pentecost, Marco Donato, Brandon Reagen, Udit Gupta, Siming Ma, Gu-Yeon Wei, David Brooks. MaxNVM: Maximizing DNN Storage Density and Inference Efficiency with Sparse Encoding and Error Mitigation. IEEE/ACM International Symposium on Microarchitecture (MICRO), 2019. ISBN: 978-1-4503-6938-1/19/10. http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2019/12/MaxNVM2019.pdf https://dl.acm.org/citation.cfm?id=3358258
Abstract: Deeply embedded applications require low-power, low-cost hardware that fits within stringent area constraints. Deep learning has many potential uses in these domains, but introduces significant inefficiencies stemming from off-chip DRAM accesses of model weights. Ideally, models would fit entirely on-chip. However, even with compression, memory requirements for state-of-the-art models make on-chip inference impractical. Due to increased density, emerging eNVMs are one promising solution. We present MaxNVM, a principled co-design of sparse encodings, protective logic, and fault-prone MLC eNVM technologies (i.e., RRAM and CTT) to enable highly-efficient DNN inference. We find bit reduction techniques (e.g., clustering and sparse compression) increase weight vulnerability to faults. This limits the capabilities of MLC eNVM. To circumvent this limitation, we improve storage density (i.e., bits-per-cell) with minimal overhead using protective logic. Tradeoffs between density and reliability result in a rich design space. We show that by balancing these techniques, the weights of large networks are able to reasonably fit on-chip. Compared to a naive, single-level-cell eNVM solution, our highly-optimized MLC memory systems reduce weight area by up to 29×. We compare our technique against NVDLA, a state-of-the-art industry-grade CNN accelerator, and demonstrate up to 3.2× reduced power and up to 3.5× reduced energy per ResNet50 inference.
Marco Donato, Lillian Pentecost, David Brooks, Gu-Yeon Wei. MEMTI: Optimizing On-Chip Nonvolatile Storage for Visual Multitask Inference at the Edge. IEEE Micro, 2019. https://ieeexplore.ieee.org/document/8859219
Abstract: The combination of specialized hardware and embedded nonvolatile memories (eNVM) holds promise for energy-efficient deep neural network (DNN) inference at the edge. However, integrating DNN hardware accelerators with eNVMs still presents several challenges. Multilevel programming is desirable for achieving maximal storage density on chip, but the stochastic nature of eNVM writes makes them prone to errors and further increases the write energy and latency. In this article, we present MEMTI, a memory architecture that leverages a multitask learning technique for maximal reuse of DNN parameters across multiple visual tasks. We show that by retraining and updating only 10% of all DNN parameters, we can achieve efficient model adaptation across a variety of visual inference tasks. The system performance is evaluated by integrating the memory with the open-source NVIDIA deep learning architecture.
Udit Gupta, Brandon Reagen, Lillian Pentecost, Marco Donato, Thierry Tambe, Alexander M. Rush, Gu-Yeon Wei, David Brooks. MASR: A Modular Accelerator for Sparse RNNs. International Conference on Parallel Architectures and Compilation Techniques (PACT), 2019. http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2019/09/masr_pact.pdf
Abstract: Recurrent neural networks (RNNs) are becoming the de facto solution for speech recognition. RNNs exploit long-term temporal relationships in data by applying repeated, learned transformations. Unlike fully-connected (FC) layers with single vector matrix operations, RNN layers consist of hundreds of such operations chained over time. This poses challenges unique to RNNs that are not found in convolutional neural networks (CNNs) or FC models, namely large dynamic activation. In this paper we present MASR, a principled and modular architecture that accelerates bidirectional RNNs for on-chip ASR. MASR is designed to exploit sparsity in both dynamic activations and static weights. The architecture is enhanced by a series of dynamic activation optimizations that enable compact storage, ensure no energy is wasted computing null operations, and maintain high MAC utilization for highly parallel accelerator designs. In comparison to current state-of-the-art sparse neural network accelerators (e.g., EIE), MASR provides 2× area, 3× energy, and 1.6× performance benefits. The modular nature of MASR enables designs that efficiently scale from resource-constrained low-power IoT applications to large-scale, highly parallel datacenter deployments.
Glenn G. Ko, Yuji Chai, Rob A. Rutenbar, David Brooks, Gu-Yeon Wei. Accelerating Bayesian Inference on Structured Graphs Using Parallel Gibbs Sampling. International Conference on Field-Programmable Logic and Applications (FPL), 2019. http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2019/10/ko-fpl2019.pdf
Abstract: Bayesian models and inference are a class of machine learning that is useful for solving problems where the amount of data is scarce and prior knowledge about the application allows you to draw better conclusions. However, Bayesian models often require computing high-dimensional integrals, and finding the posterior distribution can be intractable. One of the most commonly used approximate methods for Bayesian inference is Gibbs sampling, a Markov chain Monte Carlo (MCMC) technique to estimate the target stationary distribution. The idea in Gibbs sampling is to generate posterior samples by iterating through each of the variables and sampling from its conditional given all the other variables fixed. While Gibbs sampling is a popular method for probabilistic graphical models such as Markov Random Fields (MRFs), the plain algorithm is slow as it goes through each of the variables sequentially. In this work, we describe a binary-label MRF Gibbs sampling inference architecture and extend it to a 64-label version capable of running multiple perceptual applications, such as sound source separation and stereo matching. The described accelerator employs chromatic scheduling of variables to parallelize all the conditionally independent variables across 257 samplers, implemented on the FPGA portion of a CPU-FPGA SoC. For a real-time streaming sound source separation task, we show the hybrid CPU-FPGA implementation is 230x faster than a commercial mobile processor, while maintaining a recommended latency under 50 ms. The binary-label and 64-label versions of the MRF Gibbs sampling inference achieve 137x and 679x speedups, respectively.
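The chromatic-scheduling idea in the abstract, coloring the graph so that same-color variables are conditionally independent and can be resampled in parallel, can be illustrated on a 2-D grid MRF, where a checkerboard 2-coloring suffices. This is a generic software sketch of the algorithm, not the accelerator's implementation.

```python
import numpy as np

def chromatic_gibbs_binary_mrf(unary, coupling=1.0, n_sweeps=100, rng=None):
    """Chromatic (checkerboard) Gibbs sampling for a binary-label grid MRF.

    unary: (H, W, 2) array of per-pixel energies for labels {0, 1}.
    coupling: Potts-style penalty for each disagreeing 4-connected neighbor.
    """
    rng = np.random.default_rng() if rng is None else rng
    H, W, _ = unary.shape
    labels = rng.integers(0, 2, size=(H, W))
    ii, jj = np.meshgrid(np.arange(H), np.arange(W), indexing="ij")

    for _ in range(n_sweeps):
        for color in (0, 1):                       # all same-color pixels are independent
            mask = ((ii + jj) % 2) == color
            padded = np.pad(labels, 1)
            nbr_ones = (padded[:-2, 1:-1] + padded[2:, 1:-1]
                        + padded[1:-1, :-2] + padded[1:-1, 2:])
            ones = np.pad(np.ones_like(labels), 1)
            n_nbrs = (ones[:-2, 1:-1] + ones[2:, 1:-1]
                      + ones[1:-1, :-2] + ones[1:-1, 2:])
            # Conditional energies of labels 1 and 0 given the fixed neighbors.
            e1 = unary[..., 1] + coupling * (n_nbrs - nbr_ones)
            e0 = unary[..., 0] + coupling * nbr_ones
            p1 = 1.0 / (1.0 + np.exp(e1 - e0))     # Gibbs conditional P(label = 1)
            draw = rng.random((H, W)) < p1
            labels[mask] = draw[mask].astype(labels.dtype)
    return labels
```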
Brian Plancher, Camelia D. Brumar, Iulian Brumar, Lillian Pentecost, Saketh Rama, David Brooks. Application of Approximate Matrix Multiplication to Neural Networks and Distributed SLAM. IEEE High Performance Extreme Computing Conference (HPEC), 2019. http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2019/12/AppliedApproxMM2019.pdf
Abstract: Computational efficiency is a critical constraint for a variety of cutting-edge real-time applications. In this work, we identify an opportunity to speed up the end-to-end runtime of two such compute-bound applications by incorporating approximate linear algebra techniques. Particularly, we apply approximate matrix multiplication to artificial Neural Networks (NNs) for image classification and to the robotics problem of Distributed Simultaneous Localization and Mapping (DSLAM). Expanding upon recent sampling-based Monte Carlo approximation strategies for matrix multiplication, we develop updated theoretical bounds and an adaptive error prediction strategy. We then apply these techniques in the context of NNs and DSLAM, increasing the speed of both applications by 15-20% while maintaining a 97% classification accuracy for NNs running on the MNIST dataset and keeping the average robot position error under 1 meter (vs 0.32 meters for the exact solution). However, both applications experience variance in their results. This suggests that Monte Carlo matrix multiplication may be an effective technique to reduce the memory and computational burden of certain algorithms when used carefully, but more research is needed before these techniques can be widely used in practice.
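The sampling-based Monte Carlo approximation the abstract builds on is the classic column/row sampling estimator: A @ B is approximated by sampling a few column/row pairs with probability proportional to their norms and rescaling so the estimator is unbiased. The sketch below is a generic textbook version, not the paper's adaptive variant.

```python
import numpy as np

def approx_matmul(A, B, c, rng=None):
    """Monte Carlo approximation of A @ B using c sampled column/row pairs."""
    rng = np.random.default_rng() if rng is None else rng
    # Sample index i with probability proportional to ||A[:, i]|| * ||B[i, :]||.
    norms = np.linalg.norm(A, axis=0) * np.linalg.norm(B, axis=1)
    probs = norms / norms.sum()
    idx = rng.choice(A.shape[1], size=c, p=probs)
    # Rescale each sampled outer product by 1 / (c * p_i) so the estimate is unbiased.
    scale = 1.0 / (c * probs[idx])
    return (A[:, idx] * scale) @ B[idx, :]

# Usage: relative Frobenius error shrinks as more column/row pairs are kept.
A = np.random.randn(256, 512)
B = np.random.randn(512, 128)
exact = A @ B
approx = approx_matmul(A, B, c=128)
print(np.linalg.norm(exact - approx) / np.linalg.norm(exact))
```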
Sae Kyu Lee, Paul Whatmough, David Brooks, Gu-Yeon Wei. A 16-nm always-on DNN processor with adaptive clocking and multi-cycle banked SRAMs. IEEE Journal of Solid-State Circuits, 2019. https://ieeexplore.ieee.org/document/8715387
Paul N. Whatmough, Sae Kyu Lee, Marco Donato, Hsea-Ching Hsueh, Sam Likun Xi, Udit Gupta, Lillian Pentecost, Glenn G. Ko, David Brooks, Gu-Yeon Wei. A 16nm 25mm² SoC with a 54.5x Flexibility-Efficiency Range from Dual-Core Arm Cortex-A53 to eFPGA and Cache-Coherent Accelerators. Symposium on VLSI Circuits, 2019. https://ieeexplore.ieee.org/abstract/document/8778002/authors#authors
Abstract: This paper presents a 25mm² SoC in 16nm FinFET technology targeting flexible acceleration of compute-intensive kernels in DNN, DSP, and security algorithms. The SoC includes an always-on sub-system, a dual-core Arm A53 CPU cluster, an embedded FPGA array, and a quad-core cache-coherent accelerator cluster. Measurement results demonstrate the following observations: 1) moving DSP/cryptography kernels from the A53 to the eFPGA increases energy efficiency between 5.5× - 28.9×, 2) the use of cache coherency for datapath accelerators increases throughput by 2.94×, and 3) the accelerator flexibility-efficiency (GOPS/W) range spans from 3.1× (A53+SIMD), to 16.5× (eFPGA), to 54.5× (CCA) compared to the dual-core CPU baseline on comparable tasks. The energy per inference on the MobileNet-128 CNN shows a peak improvement of 47.6×.
Yu Emma Wang, Yuhao Zhu, Glenn G. Ko, Brandon Reagen, Gu-Yeon Wei, David Brooks. Demystifying Bayesian Inference Workloads. International Symposium on Performance Analysis of Systems and Software (ISPASS), 2019. https://yuemmawang.github.io/publications/wang-ispass2019.pdf
Abstract: The recent surge of machine learning has motivated computer architects to focus intently on accelerating related workloads, especially in deep learning. Deep learning has been the pillar algorithm that has led the advancement of learning patterns from a vast amount of labeled data, or supervised learning. However, for unsupervised learning, Bayesian methods often work better than deep learning. Bayesian modeling and inference works well with unlabeled or limited data, can leverage informative priors, and has interpretable models. Despite being an important branch of machine learning, Bayesian inference generally has been overlooked by the architecture and systems communities. In this paper, we facilitate the study of Bayesian inference with the development of BayesSuite, a collection of seminal Bayesian inference workloads. We characterize the power and performance profiles of BayesSuite across a variety of current-generation processors and find significant diversity. Manually tuning and deploying Bayesian inference workloads requires deep understanding of the workload characteristics and hardware specifications. To address these challenges and provide high-performance, energy-efficient support for Bayesian inference, we introduce a scheduling and optimization mechanism that can be plugged into a system scheduler. We also propose a computation elision technique that further improves the performance and energy efficiency of the workloads by skipping computations that do not improve the quality of the inference. Our proposed techniques are able to increase Bayesian inference performance by 5.8× on average over the naive assignment and execution of the workloads.
Yu Emma Wang, Victor Lee, Gu-Yeon Wei, David Brooks. Predicting New Workload or CPU Performance by Analyzing Public Datasets. ACM Transactions on Architecture and Code Optimization (TACO), 15(4), pp. 53:1–53:21, 2019. https://yuemmawang.github.io/publications/wang-taco2019.pdf
Abstract: The marketplace for general-purpose microprocessors offers hundreds of functionally similar models, differing by traits like frequency, core count, cache size, memory bandwidth, and power consumption. Their performance depends not only on microarchitecture, but also on the nature of the workloads being executed. Given a set of intended workloads, the consumer needs both performance and price information to make rational buying decisions. Many benchmark suites have been developed to measure processor performance, and their results for large collections of CPUs are often publicly available. However, repositories of benchmark results are not always helpful when consumers need performance data for new processors or new workloads. Moreover, the aggregate scores for benchmark suites designed to cover a broad spectrum of workload types can be misleading. To address these problems, we have developed a deep neural network (DNN) model, and we have used it to learn the relationship between the specifications of Intel CPUs and their performance on the SPEC CPU2006 and Geekbench 3 benchmark suites. We show that we can generate useful predictions for new processors and new workloads. We also cross-predict the two benchmark suites and compare their performance scores. The results quantify the self-similarity of these suites for the first time in the literature. This work should discourage consumers from basing purchasing decisions exclusively on Geekbench 3, and it should encourage academics to evaluate research using more diverse workloads than the SPEC CPU suites alone.
2018
Sae Kyu Lee, Paul N. Whatmough, Niamh Mulholland, Patrick Hansen, David Brooks, Gu-Yeon Wei. A wide dynamic range sparse FC-DNN processor with multi-cycle banked SRAM read and adaptive clocking in 16nm FinFET. ESSCIRC 2018 - IEEE 44th European Solid State Circuits Conference, 2018. https://ieeexplore.ieee.org/abstract/document/8494245
Paul N. Whatmough, Sae Kyu Lee, David Brooks, Gu-Yeon Wei. DNN ENGINE: A 28-nm Timing-Error Tolerant Sparse Deep Neural Network Processor for IoT Applications. IEEE Journal of Solid-State Circuits (JSSC), 2018.
Paul Whatmough, Sae Kyu Lee, Sam Xi, Udit Gupta, Lillian Pentecost, Marco Donato, Hsea-Ching Hsueh, David Brooks, Gu-Yeon Wei. SMIV: A 16nm SoC with Efficient and Flexible DNN Acceleration for Intelligent IoT Devices. Hot Chips 30: A Symposium on High Performance Chips, 2018.
Brandon Reagen, Udit Gupta, Lillian Pentecost, Paul Whatmough, Sae Kyu Lee, Niamh Mulholland, David Brooks, Gu-Yeon Wei Ares: a framework for quantifying the resilience of deep neural networks Conference Design Automation Conference, 2018. @conference{Reagen2018, title = {Ares: a framework for quantifying the resilience of deep neural networks}, author = {Brandon Reagen, Udit Gupta, Lillian Pentecost, Paul Whatmough, Sae Kyu Lee, Niamh Mulholland, David Brooks, Gu-Yeon Wei}, url = {https://dl.acm.org/citation.cfm?id=3195997}, year = {2018}, date = {2018-06-25}, booktitle = {Design Automation Conference}, abstract = {As the use of deep neural networks continues to grow, so does the fraction of compute cycles devoted to their execution. This has led the CAD and architecture communities to devote considerable attention to building DNN hardware. Despite these efforts, the fault tolerance of DNNs has generally been overlooked. This paper is the first to conduct a large-scale, empirical study of DNN resilience. Motivated by the inherent algorithmic resilience of DNNs, we are interested in understanding the relationship between fault rate and model accuracy. To do so, we present Ares: a light-weight, DNN-specific fault injection framework validated within 12% of real hardware. We find that DNN fault tolerance varies by orders of magnitude with respect to model, layer type, and structure. }, keywords = {}, pubstate = {published}, tppubtype = {conference} } As the use of deep neural networks continues to grow, so does the fraction of compute cycles devoted to their execution. This has led the CAD and architecture communities to devote considerable attention to building DNN hardware. Despite these efforts, the fault tolerance of DNNs has generally been overlooked. This paper is the first to conduct a large-scale, empirical study of DNN resilience. Motivated by the inherent algorithmic resilience of DNNs, we are interested in understanding the relationship between fault rate and model accuracy. To do so, we present Ares: a light-weight, DNN-specific fault injection framework validated within 12% of real hardware. We find that DNN fault tolerance varies by orders of magnitude with respect to model, layer type, and structure. |
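A rough sketch of the kind of experiment a fault-injection study automates is shown below: it flips random bits in the float32 weights of a toy linear classifier and reports how accuracy degrades as the fault rate grows. The model, data, and uniform bit-flip fault model are simplifying assumptions for illustration and are not the Ares framework itself.

```python
# Toy bit-flip fault injection into float32 weights; not the Ares framework,
# just an illustration of sweeping fault rate vs. model accuracy.
import numpy as np

rng = np.random.default_rng(1)

# Tiny synthetic 2-class problem and a "trained" linear model (closed form).
X = rng.normal(size=(1000, 16))
w_true = rng.normal(size=16)
y = (X @ w_true > 0).astype(np.float32)
w = np.linalg.lstsq(X, 2 * y - 1, rcond=None)[0].astype(np.float32)

def accuracy(weights):
    return float((((X @ weights) > 0) == y).mean())

def inject_bit_flips(weights, fault_rate, rng):
    """Flip each bit of the float32 weight array independently with prob. fault_rate."""
    bits = weights.view(np.uint32).copy()
    for b in range(32):
        mask = rng.random(bits.shape) < fault_rate
        bits[mask] ^= np.uint32(1 << b)
    return bits.view(np.float32)

for rate in [0.0, 1e-6, 1e-4, 1e-2]:
    accs = [accuracy(inject_bit_flips(w, rate, rng)) for _ in range(20)]
    print(f"fault rate {rate:g}: mean accuracy {np.mean(accs):.3f}")
```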
Marco Donato; Brandon Reagen; Lillian Pentecost; Udit Gupta; David Brooks, Gu-Yeon Wei On-Chip Deep Neural Network Storage with Multi-Level eNVM Inproceedings Design Automation Conference (DAC), 2018. @inproceedings{Donato2018, title = {On-Chip Deep Neural Network Storage with Multi-Level eNVM}, author = {Marco Donato and Brandon Reagen and Lillian Pentecost and Udit Gupta and David Brooks, Gu-Yeon Wei}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2018/04/dac2018-envm.pdf}, year = {2018}, date = {2018-06-24}, booktitle = {Design Automation Conference (DAC)}, abstract = {One of the biggest performance bottlenecks of today’s neural network (NN) accelerators is off-chip memory accesses. In this paper, we propose a method to use multi-level, embedded non-volatile memory (eNVM) to eliminate all off-chip weight accesses. The use of multi-level memory cells increases the probability of faults. Therefore, we co-design the weights and memories such that their properties complement each other and the faults result in no noticeable NN accuracy loss. In the extreme case, the weights in fully connected layers can be stored using a single transistor. With weight pruning and clustering, we show our technique reduces the memory area by over an order of magnitude compared to an SRAM baseline. In the case of VGG16 (130M weights), we are able to store all the weights in 4.9 mm2, well within the area allocated to SRAM in modern NN accelerators.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } One of the biggest performance bottlenecks of today’s neural network (NN) accelerators is off-chip memory accesses. In this paper, we propose a method to use multi-level, embedded non-volatile memory (eNVM) to eliminate all off-chip weight accesses. The use of multi-level memory cells increases the probability of faults. Therefore, we co-design the weights and memories such that their properties complement each other and the faults result in no noticeable NN accuracy loss. In the extreme case, the weights in fully connected layers can be stored using a single transistor. With weight pruning and clustering, we show our technique reduces the memory area by over an order of magnitude compared to an SRAM baseline. In the case of VGG16 (130M weights), we are able to store all the weights in 4.9 mm2, well within the area allocated to SRAM in modern NN accelerators. |
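The storage savings described above come from pruning and clustering the weights so that a few bits per cell suffice; a software-side analogue of that step is sketched below. The pruning threshold, bit width, and use of k-means are illustrative assumptions and do not capture the eNVM fault co-design that is the paper's core contribution.

```python
# Toy pruning + clustering of a weight matrix: prune small weights, then
# quantize the survivors to 2**BITS shared values (as a multi-level cell
# might store). Illustrative only; not the paper's eNVM co-design.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(2)
W = rng.normal(0, 0.05, size=(256, 256)).astype(np.float32)

PRUNE_THRESHOLD = 0.02   # assumption: magnitude-based pruning
BITS = 3                 # assumption: 3 bits per stored weight -> 8 levels

mask = np.abs(W) > PRUNE_THRESHOLD
survivors = W[mask].reshape(-1, 1)
kmeans = KMeans(n_clusters=2**BITS, n_init=10, random_state=0).fit(survivors)

W_quant = np.zeros_like(W)
W_quant[mask] = kmeans.cluster_centers_[kmeans.labels_, 0]

dense_bits = W.size * 32
sparse_bits = int(mask.sum()) * BITS  # ignores index/metadata overhead
print(f"kept {mask.mean():.1%} of weights")
print(f"storage: {dense_bits/8/1024:.0f} KiB dense fp32 -> ~{sparse_bits/8/1024:.0f} KiB quantized (excl. indices)")
print(f"mean abs reconstruction error: {np.abs(W - W_quant).mean():.5f}")
```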
Brandon Reagen, Udit Gupta, Robert Adolf, Michael M. Mitzenmacher, Alexander M. Rush, Gu-Yeon Wei, David Brooks Weightless: Lossy Weight Encoding For Deep Neural Network Compression Conference International Conference on Machine Learning, 2018. @conference{Reagen2017b, title = {Weightless: Lossy Weight Encoding For Deep Neural Network Compression}, author = {Brandon Reagen, Udit Gupta, Robert Adolf, Michael M. Mitzenmacher, Alexander M. Rush, Gu-Yeon Wei, David Brooks }, url = {https://arxiv.org/abs/1711.04686}, year = {2018}, date = {2018-05-01}, booktitle = {International Conference on Machine Learning}, abstract = {The large memory requirements of deep neural networks limit their deployment and adoption on many devices. Model compression methods effectively reduce the memory requirements of these models, usually through applying transformations such as weight pruning or quantization. In this paper, we present a novel scheme for lossy weight encoding which complements conventional compression techniques. The encoding is based on the Bloomier filter, a probabilistic data structure that can save space at the cost of introducing random errors. Leveraging the ability of neural networks to tolerate these imperfections and by re-training around the errors, the proposed technique, Weightless, can compress DNN weights by up to 496x with the same model accuracy. This results in up to a 1.51x improvement over the state-of-the-art.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The large memory requirements of deep neural networks limit their deployment and adoption on many devices. Model compression methods effectively reduce the memory requirements of these models, usually through applying transformations such as weight pruning or quantization. In this paper, we present a novel scheme for lossy weight encoding which complements conventional compression techniques. The encoding is based on the Bloomier filter, a probabilistic data structure that can save space at the cost of introducing random errors. Leveraging the ability of neural networks to tolerate these imperfections and by re-training around the errors, the proposed technique, Weightless, can compress DNN weights by up to 496x with the same model accuracy. This results in up to a 1.51x improvement over the state-of-the-art. |
Mario Lok; Elizabeth Farrell Helbling; Xuan Zhang; Robert Wood; David Brooks; Gu-Yeon Wei A Low Mass Power Electronics Unit to Drive Piezoelectric Actuators for Flying Microrobots Journal Article IEEE Transactions on Power Electronics, 33 (4), pp. 3180 - 3191, 2018. @article{Lok2018, title = {A Low Mass Power Electronics Unit to Drive Piezoelectric Actuators for Flying Microrobots}, author = {Mario Lok and Elizabeth Farrell Helbling and Xuan Zhang and Robert Wood and David Brooks and Gu-Yeon Wei}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2018/04/lok-tpe-2018.pdf}, year = {2018}, date = {2018-04-01}, journal = {IEEE Transactions on Power Electronics}, volume = {33}, number = {4}, pages = {3180 - 3191}, abstract = {This paper presents a power electronics design for the piezoelectric actuators of an insect-scale flapping-wing robot, the RoboBee. The proposed design outputs four high-voltage drive signals tailored for the two bimorph actuators of the RoboBee in an alternating drive configuration. It utilizes fully integrated drive stage circuits with a novel highside gate driver to save chip area and meet the strict mass constraint of the RoboBee. Compared with previous integrated designs, it also boosts efficiency in delivering energy to the actuators and recovering unused energy by applying three power saving techniques, dynamic common mode adjustment, envelope tracking, and charge sharing. Using this design to energize four 15 nF capacitor loads with a 200 V and 100 Hz drive signal and tracking the control commands recorded from an actual flight experiment for the robot, we measure an average power consumption of 290 mW.}, keywords = {}, pubstate = {published}, tppubtype = {article} } This paper presents a power electronics design for the piezoelectric actuators of an insect-scale flapping-wing robot, the RoboBee. The proposed design outputs four high-voltage drive signals tailored for the two bimorph actuators of the RoboBee in an alternating drive configuration. It utilizes fully integrated drive stage circuits with a novel highside gate driver to save chip area and meet the strict mass constraint of the RoboBee. Compared with previous integrated designs, it also boosts efficiency in delivering energy to the actuators and recovering unused energy by applying three power saving techniques, dynamic common mode adjustment, envelope tracking, and charge sharing. Using this design to energize four 15 nF capacitor loads with a 200 V and 100 Hz drive signal and tracking the control commands recorded from an actual flight experiment for the robot, we measure an average power consumption of 290 mW. |
2017 |
Sreela Kodali; Patrick Hansen; Niamh Mulholland; Paul Whatmough; David Brooks; Gu-Yeon Wei Applications of Deep Neural Networks for Ultra Low Power IoT Inproceedings International Conference on Computer Design, 2017. @inproceedings{Kodali2017, title = {Applications of Deep Neural Networks for Ultra Low Power IoT}, author = {Sreela Kodali and Patrick Hansen and Niamh Mulholland and Paul Whatmough and David Brooks and Gu-Yeon Wei}, year = {2017}, date = {2017-11-05}, booktitle = {International Conference on Computer Design}, abstract = {IoT devices are increasing in prevalence and popularity, becoming an indispensable part of daily life. Despite the stringent energy and computational constraints of IoT systems, specialized hardware can enable energy-efficient sensor-data classification in an increasingly diverse range of IoT applications. This paper demonstrates seven different IoT applications using a fully-connected deep neural network (FC-NN) accelerator on 28nm CMOS. The applications include audio keyword spotting, face recognition, and human activity recognition. For each application, a FC-NN model was trained from a preprocessed dataset and mapped to the accelerator. Experimental results indicate the models retained their state-of-the-art accuracy on the accelerator across a broad range of frequencies and voltages. Real-time energy results for the applications were found to be on the order of 100nJ per inference or lower.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } IoT devices are increasing in prevalence and popularity, becoming an indispensable part of daily life. Despite the stringent energy and computational constraints of IoT systems, specialized hardware can enable energy-efficient sensor-data classification in an increasingly diverse range of IoT applications. This paper demonstrates seven different IoT applications using a fully-connected deep neural network (FC-NN) accelerator on 28nm CMOS. The applications include audio keyword spotting, face recognition, and human activity recognition. For each application, a FC-NN model was trained from a preprocessed dataset and mapped to the accelerator. Experimental results indicate the models retained their state-of-the-art accuracy on the accelerator across a broad range of frequencies and voltages. Real-time energy results for the applications were found to be on the order of 100nJ per inference or lower. |
Paul Whatmough; Sae Kyu Lee; Gu-Yeon Wei; David Brooks Sub-uJ Deep Neural Networks for Embedded Applications Inproceedings IEEE 51st Asilomar Conference on Signals, Systems, and Computers, 2017. @inproceedings{Whatmough2017b, title = {Sub-uJ Deep Neural Networks for Embedded Applications}, author = {Paul Whatmough and Sae Kyu Lee and Gu-Yeon Wei and David Brooks}, year = {2017}, date = {2017-10-01}, booktitle = {IEEE 51st Asilomar Conference on Signals, Systems, and Computers}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Paul Whatmough; Saekyu Lee; Niamh Mulholland; Patrick Hansen; Sreela Kodali; David Brooks DNN ENGINE: A 16nm Sub-uJ Deep Neural Network Inference Accelerator for the Embedded Masses Inproceedings Hot Chips 29: A Symposium on High Performance Chips, 2017. @inproceedings{Whatmough2017b, title = {DNN ENGINE: A 16nm Sub-uJ Deep Neural Network Inference Accelerator for the Embedded Masses}, author = {Paul Whatmough and Saekyu Lee and Niamh Mulholland and Patrick Hansen and Sreela Kodali and David Brooks}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2018/04/HC29.22.711-DNN-Engine-Whatmough-ARM-0.6_clean.pdf}, year = {2017}, date = {2017-08-22}, booktitle = {Hot Chips 29: A Symposium on High Performance Chips}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Brandon Reagen, Robert Adolf, Paul Whatmough, Gu-Yeon Wei, David Brooks Deep Learning for Computer Architects Book Morgan & Claypool Publishers, 2017. @book{Reagen2017b, title = {Deep Learning for Computer Architects}, author = {Brandon Reagen, Robert Adolf, Paul Whatmough, Gu-Yeon Wei, David Brooks}, url = {http://www.morganclaypool.com/doi/abs/10.2200/S00783ED1V01Y201706CAC041}, year = {2017}, date = {2017-08-01}, publisher = {Morgan & Claypool Publishers}, series = {Synthesis Lectures on Computer Architecture}, abstract = {Machine learning, and specifically deep learning, has been hugely disruptive in many fields of computer science. The success of deep learning techniques in solving notoriously difficult classification and regression problems has resulted in their rapid adoption in solving real-world problems. The emergence of deep learning is widely attributed to a virtuous cycle whereby fundamental advancements in training deeper models were enabled by the availability of massive datasets and high-performance computer hardware. This text serves as a primer for computer architects in a new and rapidly evolving field. We review how machine learning has evolved since its inception in the 1960s and track the key developments leading up to the emergence of the powerful deep learning techniques that emerged in the last decade. Next we review representative workloads, including the most commonly used datasets and seminal networks across a variety of domains. In addition to discussing the workloads themselves, we also detail the most popular deep learning tools and show how aspiring practitioners can use the tools with the workloads to characterize and optimize DNNs. The remainder of the book is dedicated to the design and optimization of hardware and architectures for machine learning. As high-performance hardware was so instrumental in the success of machine learning becoming a practical solution, this chapter recounts a variety of optimizations proposed recently to further improve future designs. Finally, we present a review of recent research published in the area as well as a taxonomy to help readers understand how various contributions fall in context.}, keywords = {}, pubstate = {published}, tppubtype = {book} } Machine learning, and specifically deep learning, has been hugely disruptive in many fields of computer science. The success of deep learning techniques in solving notoriously difficult classification and regression problems has resulted in their rapid adoption in solving real-world problems. The emergence of deep learning is widely attributed to a virtuous cycle whereby fundamental advancements in training deeper models were enabled by the availability of massive datasets and high-performance computer hardware. This text serves as a primer for computer architects in a new and rapidly evolving field. We review how machine learning has evolved since its inception in the 1960s and track the key developments leading up to the emergence of the powerful deep learning techniques that emerged in the last decade. Next we review representative workloads, including the most commonly used datasets and seminal networks across a variety of domains. In addition to discussing the workloads themselves, we also detail the most popular deep learning tools and show how aspiring practitioners can use the tools with the workloads to characterize and optimize DNNs. The remainder of the book is dedicated to the design and optimization of hardware and architectures for machine learning. 
As high-performance hardware was so instrumental in the success of machine learning becoming a practical solution, this chapter recounts a variety of optimizations proposed recently to further improve future designs. Finally, we present a review of recent research published in the area as well as a taxonomy to help readers understand how various contributions fall in context. |
Brandon Reagen; Jose Miguel Hernandez-Lobato; Robert Adolf; Michael Gelbart; Paul Whatmough; Gu-Yeon Wei; David Brooks A Case for Efficient Accelerator Design Space Exploration via Bayesian Optimization Conference International Symposium on Low Power Electronics and Design, 2017. @conference{Reagen2017, title = {A Case for Efficient Accelerator Design Space Exploration via Bayesian Optimization}, author = {Brandon Reagen and Jose Miguel Hernandez-Lobato and Robert Adolf and Michael Gelbart and Paul Whatmough and Gu-Yeon Wei and David Brooks}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2017/07/reagen_islped2017.pdf}, year = {2017}, date = {2017-07-24}, booktitle = {International Symposium on Low Power Electronics and Design}, abstract = {In this paper we propose using machine learning to improve the design of deep neural network hardware accelerators. We show how to adapt multi-objective Bayesian optimization to overcome a challenging design problem: optimizing deep neural network hardware accelerators for both accuracy and energy efficiency. DNN accelerators exhibit all aspects of a challenging optimization space: the landscape is rough, evaluating designs is expensive, the objectives compete with each other, and both design spaces (algorithmic and microarchitectural) are unwieldy. With multi-objective Bayesian optimization, the design space exploration is made tractable and the design points found vastly outperform traditional methods across all metrics of interest.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } In this paper we propose using machine learning to improve the design of deep neural network hardware accelerators. We show how to adapt multi-objective Bayesian optimization to overcome a challenging design problem: optimizing deep neural network hardware accelerators for both accuracy and energy efficiency. DNN accelerators exhibit all aspects of a challenging optimization space: the landscape is rough, evaluating designs is expensive, the objectives compete with each other, and both design spaces (algorithmic and microarchitectural) are unwieldy. With multi-objective Bayesian optimization, the design space exploration is made tractable and the design points found vastly outperform traditional methods across all metrics of interest. |
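A minimal sketch of the multi-objective bookkeeping behind such a design-space exploration appears below: it samples hypothetical accelerator configurations, scores them with an invented (error, energy) cost model, and keeps the Pareto-optimal points. Random sampling stands in for the paper's Bayesian optimizer, and every number in the cost model is an assumption made for illustration.

```python
# Toy multi-objective design-space exploration: random sampling plus Pareto
# filtering. The paper uses multi-objective Bayesian optimization and real
# accelerator/DNN evaluations; the cost model here is a made-up stand-in.
import itertools
import random

random.seed(0)

def evaluate(cfg):
    """Hypothetical (prediction_error, energy_per_inference) model of a design point."""
    bits, lanes, sram_kb = cfg
    error = 0.05 + 0.5 / bits + 0.02 * (64 / sram_kb)           # less precision / SRAM -> more error
    energy = 0.1 * bits * lanes + 0.01 * sram_kb + 1.0 / lanes  # wider / bigger -> more energy
    return error, energy

def dominates(a, b):
    """True if point a is no worse than b in every objective and better in at least one."""
    return all(x <= y for x, y in zip(a, b)) and any(x < y for x, y in zip(a, b))

space = list(itertools.product([4, 8, 16], [1, 2, 4, 8], [16, 32, 64, 128]))
samples = [(cfg, evaluate(cfg)) for cfg in random.sample(space, 20)]

pareto = [(cfg, obj) for cfg, obj in samples
          if not any(dominates(other, obj) for _, other in samples if other != obj)]

for cfg, (err, energy) in sorted(pareto, key=lambda p: p[1][0]):
    print(f"bits={cfg[0]:>2} lanes={cfg[1]} sram={cfg[2]:>3}KB  error={err:.3f}  energy={energy:.2f}")
```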
Xuan Zhang; Mario Lok; Tao Tong; Sae Kyu Lee; Brandon Reagen; Pierre-Emile J. Duhamel; Robert Wood; David Brooks; Gu-Yeon Wei A Fully Integrated Battery-Powered System-on-Chip in 40-nm CMOS for Closed-Loop Control of Insect-Scale Pico-Aerial Vehicle Journal Article IEEE Journal of Solid-State Circuits, 52 (9), 2017. @article{Zhang2017, title = {A Fully Integrated Battery-Powered System-on-Chip in 40-nm CMOS for Closed-Loop Control of Insect-Scale Pico-Aerial Vehicle}, author = {Xuan Zhang and Mario Lok and Tao Tong and Sae Kyu Lee and Brandon Reagen and Pierre-Emile J. Duhamel and Robert Wood and David Brooks and Gu-Yeon Wei}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2018/04/robobee-jssc.pdf}, year = {2017}, date = {2017-06-12}, journal = {IEEE Journal of Solid-State Circuits}, volume = {52}, number = {9}, abstract = {We demonstrate a fully integrated system-on-chip (SoC) optimized for insect-scale flapping-wing pico-aerial vehicles. The SoC is able to meet the stringent weight, power, and real-time performance demands of autonomous flight for a bee-sized robot. The entire integrated system with embedded voltage regulation, data conversion, clock generation, as well as both general-purpose and accelerated computing units, weighs less than 3 mg after die thinning. It is self-contained and can be powered directly off of a lithium battery. Measured results show open-loop wing flapping controlled by the SoC and improved energy efficiency through the use of hardware acceleration and supply resilience through the use of adaptive clocking.}, keywords = {}, pubstate = {published}, tppubtype = {article} } We demonstrate a fully integrated system-on-chip (SoC) optimized for insect-scale flapping-wing pico-aerial vehicles. The SoC is able to meet the stringent weight, power, and real-time performance demands of autonomous flight for a bee-sized robot. The entire integrated system with embedded voltage regulation, data conversion, clock generation, as well as both general-purpose and accelerated computing units, weighs less than 3 mg after die thinning. It is self-contained and can be powered directly off of a lithium battery. Measured results show open-loop wing flapping controlled by the SoC and improved energy efficiency through the use of hardware acceleration and supply resilience through the use of adaptive clocking. |
Svilen Kanev; Sam (Likun) Xi; Gu-Yeon Wei; David Brooks Mallacc: Accelerating Memory Allocation Conference International Symposium on Architectural Support for Programming Languages and Operating Systems (ASPLOS), 2017. @conference{Kanev2017, title = {Mallacc: Accelerating Memory Allocation}, author = {Svilen Kanev and Sam (Likun) Xi and Gu-Yeon Wei and David Brooks}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2017/02/asplos17mallacc.pdf}, year = {2017}, date = {2017-04-08}, booktitle = {International Symposium on Architectural Support for Programming Languages and Operating Systems (ASPLOS)}, abstract = {Recent work shows that dynamic memory allocation consumes nearly 7% of all cycles in Google datacenters. With the trend towards increased specialization of hardware, we propose Mallacc, an in-core hardware accelerator designed for broad use across a number of high-performance, modern memory allocators. The design of Mallacc is quite different from traditional throughput-oriented hardware accelerators. Because memory allocation requests tend to be very frequent, fast, and interspersed inside other application code, accelerators must be optimized for latency rather than throughput and area overheads must be kept to a bare minimum. Mallacc accelerates the three primary operations of a typical memory allocation request: size class computation, retrieval of a free memory block, and sampling of memory usage. Our results show that malloc latency can be reduced by up to 50% with a hardware cost of less than 1500 μm² of silicon area, less than 0.006% of a typical high-performance processor core.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Recent work shows that dynamic memory allocation consumes nearly 7% of all cycles in Google datacenters. With the trend towards increased specialization of hardware, we propose Mallacc, an in-core hardware accelerator designed for broad use across a number of high-performance, modern memory allocators. The design of Mallacc is quite different from traditional throughput-oriented hardware accelerators. Because memory allocation requests tend to be very frequent, fast, and interspersed inside other application code, accelerators must be optimized for latency rather than throughput and area overheads must be kept to a bare minimum. Mallacc accelerates the three primary operations of a typical memory allocation request: size class computation, retrieval of a free memory block, and sampling of memory usage. Our results show that malloc latency can be reduced by up to 50% with a hardware cost of less than 1500 μm² of silicon area, less than 0.006% of a typical high-performance processor core. |
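The first of the three accelerated operations, size-class computation, is easy to sketch in software. The class table below is invented for illustration; real allocators such as tcmalloc define their own classes, and Mallacc performs the equivalent lookup in dedicated hardware.

```python
# Toy size-class computation as a malloc front end would do it: round a
# request up to the smallest size class that fits. Class table is invented.
import bisect

# Hypothetical size classes in bytes (real allocators define their own).
SIZE_CLASSES = [8, 16, 32, 48, 64, 96, 128, 192, 256, 512, 1024, 2048, 4096]

def size_class(request_bytes):
    """Return (class_index, class_size) for a small allocation request."""
    i = bisect.bisect_left(SIZE_CLASSES, request_bytes)
    if i == len(SIZE_CLASSES):
        raise ValueError("large allocation: falls through to the page allocator")
    return i, SIZE_CLASSES[i]

for req in [1, 24, 100, 4000]:
    idx, size = size_class(req)
    print(f"request {req:>4} B -> class {idx} ({size} B, {size - req} B internal fragmentation)")
```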
Sae Kyu Lee; Tao Tong; Xuan Zhang; David Brooks; Gu-Yeon Wei A 16-Core Voltage-Stacked System With Adaptive Clocking and an Integrated Switched-Capacitor DC–DC Converter Journal Article IEEE Transactions on VLSI, 25 (4), pp. 1271-1284, 2017. @article{Lee2017, title = {A 16-Core Voltage-Stacked System With Adaptive Clocking and an Integrated Switched-Capacitor DC–DC Converter}, author = {Sae Kyu Lee and Tao Tong and Xuan Zhang and David Brooks and Gu-Yeon Wei}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2017/06/sklee_tvlsi2017_final_print.pdf}, year = {2017}, date = {2017-04-01}, journal = {IEEE Transactions on VLSI}, volume = {25}, number = {4}, pages = {1271-1284}, abstract = {This paper presents a 16-core voltage-stacked system with adaptive frequency clocking (AFClk) and a fully integrated voltage regulator that demonstrates efficient on-chip power delivery for multicore systems. Voltage stacking alleviates power delivery inefficiencies due to off-chip parasitics but adds complexity to combat internal voltage noise. To address the corresponding issue of internal voltage noise, the system utilizes an AFClk scheme with an efficient switched-capacitor dc-dc converter to mitigate noise on the stack layers and to improve system performance and efficiency. Experimental results demonstrate robust voltage noise mitigation as well as the potential of voltage stacking as a highly efficient power delivery scheme. This paper also illustrates that augmenting the hardware techniques with intelligent workload allocation that exploits the inherent properties of voltage stacking can preemptively reduce the interlayer activity mismatch and improve system efficiency.}, keywords = {}, pubstate = {published}, tppubtype = {article} } This paper presents a 16-core voltage-stacked system with adaptive frequency clocking (AFClk) and a fully integrated voltage regulator that demonstrates efficient on-chip power delivery for multicore systems. Voltage stacking alleviates power delivery inefficiencies due to off-chip parasitics but adds complexity to combat internal voltage noise. To address the corresponding issue of internal voltage noise, the system utilizes an AFClk scheme with an efficient switched-capacitor dc-dc converter to mitigate noise on the stack layers and to improve system performance and efficiency. Experimental results demonstrate robust voltage noise mitigation as well as the potential of voltage stacking as a highly efficient power delivery scheme. This paper also illustrates that augmenting the hardware techniques with intelligent workload allocation that exploits the inherent properties of voltage stacking can preemptively reduce the interlayer activity mismatch and improve system efficiency. |
Paul N. Whatmough; Sae Kyu Lee; Hyunkwang Lee; Saketh Rama; David Brooks; Gu-Yeon Wei A 28nm SoC with a 1.2GHz 568nJ/Prediction Sparse Deep-Neural-Network Engine with >0.1 Timing Error Rate Tolerance for IoT Applications Inproceedings International Solid-State Circuits Conference, 2017. @inproceedings{Whatmough2017, title = {A 28nm SoC with a 1.2GHz 568nJ/Prediction Sparse Deep-Neural-Network Engine with >0.1 Timing Error Rate Tolerance for IoT Applications}, author = {Paul N. Whatmough and Sae Kyu Lee and Hyunkwang Lee and Saketh Rama and David Brooks and Gu-Yeon Wei}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2017/02/whatmough_isscc2017.pdf http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2017/02/whatmough_isscc2017_slides.pdf}, year = {2017}, date = {2017-02-05}, booktitle = {International Solid-State Circuits Conference}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
2016 |
Yakun Sophia Shao; Sam (Likun) Xi; Vijayalakshmi Srinivasan; Gu-Yeon Wei; David Brooks Co-Designing Accelerators and SoC Interfaces using gem5-Aladdin Inproceedings International Symposium on Microarchitecture (MICRO), 2016. @inproceedings{Shao2016, title = {Co-Designing Accelerators and SoC Interfaces using gem5-Aladdin}, author = {Yakun Sophia Shao and Sam (Likun) Xi and Vijayalakshmi Srinivasan and Gu-Yeon Wei and David Brooks}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2016/08/shao_micro2016.pdf}, year = {2016}, date = {2016-10-17}, booktitle = {International Symposium on Microarchitecture (MICRO)}, abstract = {Increasing demand for power-efficient, high- performance computing has spurred a growing number and diversity of hardware accelerators in mobile and server Systems on Chip (SoCs). This paper makes the case that the co-design of the accelerator microarchitecture with the system in which it belongs is critical to balanced, efficient accelerator microarchitectures. We find that data movement and coherence management for accelerators are significant yet often unaccounted components of total accelerator runtime, resulting in misleading performance predictions and inefficient accelerator designs. To explore the design space of accelerator-system co-design, we develop gem5-Aladdin, an SoC simulator that captures dynamic interactions between accelerators and the SoC platform, and validate it to within 6% against real hardware. Our co-design studies show that the optimal energy-delay-product (EDP) of an accelerator microarchitecture can improve by up to 7.4x when system-level effects are considered compared to optimizing accelerators in isolation.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Increasing demand for power-efficient, high- performance computing has spurred a growing number and diversity of hardware accelerators in mobile and server Systems on Chip (SoCs). This paper makes the case that the co-design of the accelerator microarchitecture with the system in which it belongs is critical to balanced, efficient accelerator microarchitectures. We find that data movement and coherence management for accelerators are significant yet often unaccounted components of total accelerator runtime, resulting in misleading performance predictions and inefficient accelerator designs. To explore the design space of accelerator-system co-design, we develop gem5-Aladdin, an SoC simulator that captures dynamic interactions between accelerators and the SoC platform, and validate it to within 6% against real hardware. Our co-design studies show that the optimal energy-delay-product (EDP) of an accelerator microarchitecture can improve by up to 7.4x when system-level effects are considered compared to optimizing accelerators in isolation. |
Robert Adolf; Saketh Rama; Brandon Reagen; Gu-Yeon Wei ; David Brooks Fathom: Reference Workloads for Modern Deep Learning Methods Inproceedings IEEE International Symposium on Workload Characterization, 2016. @inproceedings{Adolf2016, title = {Fathom: Reference Workloads for Modern Deep Learning Methods}, author = {Robert Adolf and Saketh Rama and Brandon Reagen and Gu-Yeon Wei and David Brooks}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2016/08/iiswc2016-final.pdf http://arxiv.org/abs/1608.06581 https://rdadolf.github.io/fathom/}, year = {2016}, date = {2016-09-25}, booktitle = {IEEE International Symposium on Workload Characterization}, abstract = {Deep learning has been popularized by its recent successes on challenging artificial intelligence problems. One of the reasons for its dominance is also an ongoing challenge: the need for immense amounts of computational power. Hardware architects have responded by proposing a wide array of promising ideas, but to date, the majority of the work has focused on specific algorithms in somewhat narrow application domains. While their specificity does not diminish these approaches, there is a clear need for more flexible solutions. We believe the first step is to examine the characteristics of cutting edge models from across the deep learning community. Consequently, we have assembled Fathom: a collection of eight archetypal deep learning workloads for study. Each of these models comes from a seminal work in the deep learning community, ranging from the familiar deep convolutional neural network of Krizhevsky et al., to the more exotic memory networks from Facebook’s AI research group. Fathom has been released online, and this paper focuses on understanding the fundamental performance characteristics of each model. We use a set of application-level modeling tools built around the TensorFlow deep learning framework in order to analyze the behavior of the Fathom workloads. We present a breakdown of where time is spent, the similarities between the performance profiles of our models, an analysis of behavior in inference and training, and the effects of parallelism on scaling.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Deep learning has been popularized by its recent successes on challenging artificial intelligence problems. One of the reasons for its dominance is also an ongoing challenge: the need for immense amounts of computational power. Hardware architects have responded by proposing a wide array of promising ideas, but to date, the majority of the work has focused on specific algorithms in somewhat narrow application domains. While their specificity does not diminish these approaches, there is a clear need for more flexible solutions. We believe the first step is to examine the characteristics of cutting edge models from across the deep learning community. Consequently, we have assembled Fathom: a collection of eight archetypal deep learning workloads for study. Each of these models comes from a seminal work in the deep learning community, ranging from the familiar deep convolutional neural network of Krizhevsky et al., to the more exotic memory networks from Facebook’s AI research group. Fathom has been released online, and this paper focuses on understanding the fundamental performance characteristics of each model. We use a set of application-level modeling tools built around the TensorFlow deep learning framework in order to analyze the behavior of the Fathom workloads. 
We present a breakdown of where time is spent, the similarities between the performance profiles of our models, an analysis of behavior in inference and training, and the effects of parallelism on scaling. |
Tao Tong; Sae Kyu Lee; Xuan Zhang; David Brooks; Gu-Yeon Wei A Fully Integrated Reconfigurable Switched-Capacitor DC-DC Converter With Four Stacked Output Channels for Voltage Stacking Applications Journal Article IEEE Journal of Solid-State Circuits, 51 (9), pp. 2142–2152, 2016. @article{Tong2016, title = {A Fully Integrated Reconfigurable Switched-Capacitor DC-DC Converter With Four Stacked Output Channels for Voltage Stacking Applications}, author = {Tao Tong and Sae Kyu Lee and Xuan Zhang and David Brooks and Gu-Yeon Wei}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2017/06/Tong_JSSC2016_final.pdf}, year = {2016}, date = {2016-09-18}, journal = {IEEE Journal of Solid-State Circuits}, volume = {51}, number = {9}, pages = {2142--2152}, abstract = {This work presents a fully integrated 4-to-1 DC-DC symmetric ladder switched-capacitor converter (SLSCC) for voltage stacking applications. The SLSCC absorbs inter-layer load power mismatch to provide minimum voltage guarantees for the internal rails of a multicore system that implements four-way voltage stacking. A new hybrid feedback control scheme reduces the voltage ripple across stacked voltage layers for high levels of current mismatch, a condition that exacerbates voltage noise in conventional SC converters. Furthermore, the proposed SLSCC dynamically allocates valuable flying capacitor resources according to different load conditions, which improves conversion efficiency and supports more power mismatch between the layers. Implemented in TSMC’s 40G process, the SLSCC converts a 3.6 V input voltage down to four stacked output voltage layers, each nominally at 900 mV.}, keywords = {}, pubstate = {published}, tppubtype = {article} } This work presents a fully integrated 4-to-1 DC-DC symmetric ladder switched-capacitor converter (SLSCC) for voltage stacking applications. The SLSCC absorbs inter-layer load power mismatch to provide minimum voltage guarantees for the internal rails of a multicore system that implements four-way voltage stacking. A new hybrid feedback control scheme reduces the voltage ripple across stacked voltage layers for high levels of current mismatch, a condition that exacerbates voltage noise in conventional SC converters. Furthermore, the proposed SLSCC dynamically allocates valuable flying capacitor resources according to different load conditions, which improves conversion efficiency and supports more power mismatch between the layers. Implemented in TSMC’s 40G process, the SLSCC converts a 3.6 V input voltage down to four stacked output voltage layers, each nominally at 900 mV. |
Brandon Reagen; Paul Whatmough; Robert Adolf; Saketh Rama; Hyunkwang Lee; Sae Kyu Lee; José Miguel Hernández-Lobato; Gu-Yeon Wei; David Brooks Minerva: Enabling Low-Power, Highly-Accurate Deep Neural Network Accelerators Inproceedings International Symposium on Computer Architecture (ISCA), 2016. @inproceedings{Reagen2016, title = {Minerva: Enabling Low-Power, Highly-Accurate Deep Neural Network Accelerators}, author = {Brandon Reagen and Paul Whatmough and Robert Adolf and Saketh Rama and Hyunkwang Lee and Sae Kyu Lee and José Miguel Hernández-Lobato and Gu-Yeon Wei and David Brooks}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2016/05/reagen_isca16.pdf}, year = {2016}, date = {2016-06-18}, booktitle = {International Symposium on Computer Architecture (ISCA)}, abstract = {The continued success of Deep Neural Networks (DNNs) in classification tasks has sparked a trend of accelerating their execution with specialized hardware. While published designs easily give an order of magnitude improvement over general-purpose hardware, few look beyond an initial implementation. This paper presents Minerva, a highly automated co-design approach across the algorithm, architecture, and circuit levels to optimize DNN hardware accelerators. Compared to an established fixed-point accelerator baseline, we show that fine-grained, heterogeneous datatype optimization reduces power by 1.5×; aggressive, inline predication and pruning of small activity values further reduces power by 2.0×; and active hardware fault detection coupled with domain-aware error mitigation eliminates an additional 2.7× through lowering SRAM voltages. Across five datasets, these optimizations provide a collective average of 8.1× power reduction over an accelerator baseline without compromising DNN model accuracy. Minerva enables highly accurate, ultra-low power DNN accelerators (in the range of tens of milliwatts), making it feasible to deploy DNNs in power-constrained IoT and mobile devices.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The continued success of Deep Neural Networks (DNNs) in classification tasks has sparked a trend of accelerating their execution with specialized hardware. While published designs easily give an order of magnitude improvement over general-purpose hardware, few look beyond an initial implementation. This paper presents Minerva, a highly automated co-design approach across the algorithm, architecture, and circuit levels to optimize DNN hardware accelerators. Compared to an established fixed-point accelerator baseline, we show that fine-grained, heterogeneous datatype optimization reduces power by 1.5×; aggressive, inline predication and pruning of small activity values further reduces power by 2.0×; and active hardware fault detection coupled with domain-aware error mitigation eliminates an additional 2.7× through lowering SRAM voltages. Across five datasets, these optimizations provide a collective average of 8.1× power reduction over an accelerator baseline without compromising DNN model accuracy. Minerva enables highly accurate, ultra-low power DNN accelerators (in the range of tens of milliwatts), making it feasible to deploy DNNs in power-constrained IoT and mobile devices. |
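One of Minerva's optimizations, predicating away small activation values, can be illustrated in a few lines. The layer shapes, threshold, and operation counting below are assumptions made for the example; the paper applies this idea inside a fixed-point accelerator together with datatype and SRAM-voltage optimizations.

```python
# Toy illustration of pruning small activations before a fully connected
# layer and counting the multiply-accumulates that can be skipped.
# Threshold and layer shapes are illustrative assumptions.
import numpy as np

rng = np.random.default_rng(3)
activations = np.maximum(rng.normal(0, 1, size=(1, 1024)), 0)  # post-ReLU, so many zeros already
weights = rng.normal(0, 0.05, size=(1024, 256))

THRESHOLD = 0.2  # assumption: activations below this contribute little to the output

pruned = np.where(np.abs(activations) < THRESHOLD, 0.0, activations)
full = activations @ weights
approx = pruned @ weights

total_macs = activations.size * weights.shape[1]
skipped_macs = int((pruned == 0).sum()) * weights.shape[1]
rel_err = np.abs(full - approx).mean() / (np.abs(full).mean() + 1e-12)

print(f"skipped {skipped_macs/total_macs:.1%} of MACs, mean relative output error {rel_err:.3%}")
```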
2015 |
Mario Lok; Xuan Zhang; Elizabeth Farrell Helbling; Robert Wood; David Brooks; Gu-Yeon Wei A Power Electronics Unit to Drive Piezoelectric Actuators for Flying Microrobots Inproceedings IEEE Custom Integrated Circuits Conference (CICC), 2015. @inproceedings{Lok2015, title = {A Power Electronics Unit to Drive Piezoelectric Actuators for Flying Microrobots}, author = {Mario Lok and Xuan Zhang and Elizabeth Farrell Helbling and Robert Wood and David Brooks and Gu-Yeon Wei}, url = {https://micro.seas.harvard.edu/papers/Lok_CICC_2015.pdf}, year = {2015}, date = {2015-09-01}, booktitle = {IEEE Custom Integrated Circuits Conference (CICC)}, abstract = {This paper describes a power electronics unit (PEU) for an insect-scale flapping-wing robot. Three power saving techniques used in the actuator driver of the PEU — envelope tracking, dynamic common mode, and charge sharing — reduce power consumption while retaining weight benefits of an inductor-less linear driver. A pair of actuator driver ICs energize four 15nF capacitor loads, which represent the piezoelectric actuators of a flapping-wing robot. The PEU consumes 290mW, which translates to 37% lower power compared to a design without these power saving techniques.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } This paper describes a power electronics unit (PEU) for an insect-scale flapping-wing robot. Three power saving techniques used in the actuator driver of the PEU — envelope tracking, dynamic common mode, and charge sharing — reduce power consumption while retaining weight benefits of an inductor-less linear driver. A pair of actuator driver ICs energize four 15nF capacitor loads, which represent the piezoelectric actuators of a flapping-wing robot. The PEU consumes 290mW, which translates to 37% lower power compared to a design without these power saving techniques. |
Xuan Zhang; Mario Lok; Tao Tong; Simon Chaput; Sae Kyu Lee; Brandon Reagen; Hyunkwang Lee; David Brooks; Gu-Yeon Wei A Multi-Chip System Optimized for Insect-Scale Flapping-Wing Robots Inproceedings IEEE Symposium on VLSI Circuits (VLSIC), 2015. @inproceedings{Zhang2015, title = {A Multi-Chip System Optimized for Insect-Scale Flapping-Wing Robots}, author = {Xuan Zhang and Mario Lok and Tao Tong and Simon Chaput and Sae Kyu Lee and Brandon Reagen and Hyunkwang Lee and David Brooks and Gu-Yeon Wei}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2015/06/vlsi2015_robobee.pdf}, year = {2015}, date = {2015-06-16}, booktitle = {IEEE Symposium on VLSI Circuits (VLSIC)}, abstract = {We demonstrate a battery-powered multi-chip system optimized for insect-scale flapping wing robots that meets the tight weight limit and real-time performance demands of autonomous flight. Measured results show open-loop wing flapping driven by a power electronics unit and energy efficiency improvements via hardware acceleration. }, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } We demonstrate a battery-powered multi-chip system optimized for insect-scale flapping wing robots that meets the tight weight limit and real-time performance demands of autonomous flight. Measured results show open-loop wing flapping driven by a power electronics unit and energy efficiency improvements via hardware acceleration. |
Sae Kyu Lee; Tao Tong; Xuan Zhang; David Brooks; Gu-Yeon Wei A 16-Core Voltage-Stacked System with an Integrated Switched-Capacitor DC-DC Converter Inproceedings IEEE Symposium on VLSI Circuits (VLSIC), 2015. @inproceedings{Lee2015, title = {A 16-Core Voltage-Stacked System with an Integrated Switched-Capacitor DC-DC Converter}, author = {Sae Kyu Lee and Tao Tong and Xuan Zhang and David Brooks and Gu-Yeon Wei }, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2015/06/VLSI_2015_FVS_Submission_Final.pdf}, year = {2015}, date = {2015-06-16}, booktitle = {IEEE Symposium on VLSI Circuits (VLSIC)}, abstract = {A 16-core voltage-stacked IC integrated with a switched-capacitor DC-DC converter demonstrates efficient power delivery. To overcome inter-layer voltage noise issues, the test chip implements and evaluates the benefits of self-timed clocking and clock-phase interleaving. The integrated converter offers minimum voltage guarantees and further reduces voltage noise. }, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } A 16-core voltage-stacked IC integrated with a switched-capacitor DC-DC converter demonstrates efficient power delivery. To overcome inter-layer voltage noise issues, the test chip implements and evaluates the benefits of self-timed clocking and clock-phase interleaving. The integrated converter offers minimum voltage guarantees and further reduces voltage noise. |
Paul N. Whatmough; George Smart; Shidhartha Das; Yiannis Andreopoulos; David M. Bull A 0.6V All-Digital Body-Coupled Wakeup Transceiver for IoT Applications Inproceedings IEEE Symposium on VLSI Circuits (VLSIC), 2015. @inproceedings{Whatmough2015, title = {A 0.6V All-Digital Body-Coupled Wakeup Transceiver for IoT Applications }, author = {Paul N. Whatmough and George Smart and Shidhartha Das and Yiannis Andreopoulos and David M. Bull}, url = {http://vlsiarch.eecs.harvard.edu/wp-content/uploads/2015/06/C5_3.pdf}, year = {2015}, date = {2015-06-16}, booktitle = {IEEE Symposium on VLSI Circuits (VLSIC)}, abstract = {A body-coupled symmetric wakeup transceiver is proposed for always-on device discovery in IoT applications requiring security and low-power consumption. The wakeup transceiver (WTRx) is implemented in 65nm CMOS, using digital logic cells and operates at 0.6V. A directly-modulated open-loop DCO generates an OOK-modulated 10MHz carrier, with a frequency-locked loop for intermittent calibration. A passive receiver incorporates a digital IO cell as hysteretic comparator, with a two-phase correlator bank. A novel MAC scheme allows for duty-cycling in both transmitter and receiver. Measured power consumption is 3.54μW, with sensitivity of 88mV and maximum wakeup latency of 150ms.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } A body-coupled symmetric wakeup transceiver is proposed for always-on device discovery in IoT applications requiring security and low-power consumption. The wakeup transceiver (WTRx) is implemented in 65nm CMOS, using digital logic cells and operates at 0.6V. A directly-modulated open-loop DCO generates an OOK-modulated 10MHz carrier, with a frequency-locked loop for intermittent calibration. A passive receiver incorporates a digital IO cell as hysteretic comparator, with a two-phase correlator bank. A novel MAC scheme allows for duty-cycling in both transmitter and receiver. Measured power consumption is 3.54μW, with sensitivity of 88mV and maximum wakeup latency of 150ms. |
Svilen Kanev, Juan Pablo Darago, Kim Hazelwood, Parthasarathy Ranganathan, Tipp Moseley, Gu-Yeon Wei, David Brooks Profiling a Warehouse-Scale Computer Inproceedings International Symposium on Computer Architecture (ISCA), 2015. @inproceedings{kanev15wsc, title = {Profiling a Warehouse-Scale Computer}, author = {Svilen Kanev, Juan Pablo Darago, Kim Hazelwood, Parthasarathy Ranganathan, Tipp Moseley, Gu-Yeon Wei, David Brooks}, url = {http://www.eecs.harvard.edu/~skanev/papers/isca15wsc.pdf}, year = {2015}, date = {2015-06-15}, booktitle = {International Symposium on Computer Architecture (ISCA)}, abstract = {With the increasing prevalence of warehouse-scale (WSC) and cloud computing, understanding the interactions of server applications with the underlying microarchitecture becomes ever more important in order to extract maximum performance out of server hardware. To aid such understanding, this paper presents a detailed microarchitectural analysis of live datacenter jobs, measured on more than 20,000 Google machines over a three year period, and comprising thousands of different applications. We first find that WSC workloads are extremely diverse, breeding the need for architectures that can tolerate application variability without performance loss. However, some patterns emerge, offering opportunities for co-optimization of hardware and software. For example, we identify common building blocks in the lower levels of the software stack. This \"datacenter tax\" can comprise nearly 30% of cycles across jobs running in the fleet, which makes its constituents prime candidates for hardware specialization in future server systems-on-chips. We also uncover opportunities for classic microarchitectural optimizations for server processors, especially in the cache hierarchy. Typical workloads place significant stress on instruction caches and prefer memory latency over bandwidth. They also stall cores often, but compute heavily in bursts. These observations motivate several interesting directions for future warehouse-scale computers.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } With the increasing prevalence of warehouse-scale (WSC) and cloud computing, understanding the interactions of server applications with the underlying microarchitecture becomes ever more important in order to extract maximum performance out of server hardware. To aid such understanding, this paper presents a detailed microarchitectural analysis of live datacenter jobs, measured on more than 20,000 Google machines over a three year period, and comprising thousands of different applications. We first find that WSC workloads are extremely diverse, breeding the need for architectures that can tolerate application variability without performance loss. However, some patterns emerge, offering opportunities for co-optimization of hardware and software. For example, we identify common building blocks in the lower levels of the software stack. This "datacenter tax" can comprise nearly 30% of cycles across jobs running in the fleet, which makes its constituents prime candidates for hardware specialization in future server systems-on-chips. We also uncover opportunities for classic microarchitectural optimizations for server processors, especially in the cache hierarchy. Typical workloads place significant stress on instruction caches and prefer memory latency over bandwidth. They also stall cores often, but compute heavily in bursts. 
These observations motivate several interesting directions for future warehouse-scale computers. |
Sam Xi; Hans Jacobson; Pradip Bose; Gu-Yeon Wei; David Brooks Quantifying Sources of Error in McPAT and Potential Impacts on Architectural Studies Conference International Symposium on High Performance Computer Architecture (HPCA), 2015. @conference{Xi2015_hpca, title = {Quantifying Sources of Error in McPAT and Potential Impacts on Architectural Studies}, author = {Sam Xi and Hans Jacobson and Pradip Bose and Gu-Yeon Wei and David Brooks}, url = {http://www.samxi.org/papers/xi_hpca2015.pdf}, year = {2015}, date = {2015-02-07}, booktitle = {International Symposium on High Performance Computer Architecture (HPCA)}, abstract = {Architectural power modeling tools are widely used by the computer architecture community for rapid evaluations of high-level design choices and design space explorations. Currently, McPAT is the de facto power model, but the literature does not yet contain a careful examination of its modeling accuracy. In addition, the issue of how greatly power modeling error can affect architectural-level studies has not been quantified before. In this work, we present the first rigorous assessment of McPAT’s core power and area models with a detailed, validated power modeling toolchain used in current industrial practice. We find that McPAT’s predictions can have significant error because some of the models are either incomplete, too high-level, or assume implementations of structures that differ from that of the core at hand. We demonstrate that large errors are possible when using McPAT’s dynamic power estimates in the context of voltage noise and thermal hotspots, but for steady-state properties, accurately modeling leakage power is more important. Based on our analysis, we are able to provide guidelines for creating accurate McPAT models, even without access to detailed industrial power modeling tools. We conclude that in spite of its accuracy gaps, McPAT is still a very useful tool for many architectural studies, and its limitations can often be adequately addressed for a given research study of interest. }, keywords = {}, pubstate = {published}, tppubtype = {conference} } Architectural power modeling tools are widely used by the computer architecture community for rapid evaluations of high-level design choices and design space explorations. Currently, McPAT is the de facto power model, but the literature does not yet contain a careful examination of its modeling accuracy. In addition, the issue of how greatly power modeling error can affect architectural-level studies has not been quantified before. In this work, we present the first rigorous assessment of McPAT’s core power and area models with a detailed, validated power modeling toolchain used in current industrial practice. We find that McPAT’s predictions can have significant error because some of the models are either incomplete, too high-level, or assume implementations of structures that differ from that of the core at hand. We demonstrate that large errors are possible when using McPAT’s dynamic power estimates in the context of voltage noise and thermal hotspots, but for steady-state properties, accurately modeling leakage power is more important. Based on our analysis, we are able to provide guidelines for creating accurate McPAT models, even without access to detailed industrial power modeling tools. We conclude that in spite of its accuracy gaps, McPAT is still a very useful tool for many architectural studies, and its limitations can often be adequately addressed for a given research study of interest. |
Simone Campanoni; Glenn Holloway; Gu-Yeon Wei; David Brooks HELIX-UP: Relaxing Program Semantics to Unleash Parallelization Conference International Symposium on Code Generation and Optimization (CGO), 2015. @conference{Campanoni2015_cgo, title = {HELIX-UP: Relaxing Program Semantics to Unleash Parallelization}, author = {Simone Campanoni and Glenn Holloway and Gu-Yeon Wei and David Brooks}, url = {http://www.eecs.harvard.edu/~xan/lib/exe/fetch.php?media=research:cgo2015_paper.pdf}, year = {2015}, date = {2015-02-07}, booktitle = {International Symposium on Code Generation and Optimization (CGO)}, abstract = {Automatic generation of parallel code for general-purpose commodity processors is a challenging computational problem. Nevertheless, there is a lot of latent thread-level parallelism in the way sequential programs are actually used. To convert latent parallelism into performance gains, users may be willing to compromise on the quality of a program\'s results. We have developed a parallelizing compiler and runtime that substantially improve scalability by allowing parallelized code to briefly sidestep strict adherence to language semantics at run time. In addition to boosting performance, our approach limits the sensitivity of parallelized code to the parameters of target CPUs (such as core-to-core communication latency) and the accuracy of data dependence analysis.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Automatic generation of parallel code for general-purpose commodity processors is a challenging computational problem. Nevertheless, there is a lot of latent thread-level parallelism in the way sequential programs are actually used. To convert latent parallelism into performance gains, users may be willing to compromise on the quality of a program's results. We have developed a parallelizing compiler and runtime that substantially improve scalability by allowing parallelized code to briefly sidestep strict adherence to language semantics at run time. In addition to boosting performance, our approach limits the sensitivity of parallelized code to the parameters of target CPUs (such as core-to-core communication latency) and the accuracy of data dependence analysis. |
Yakun Sophia Shao; Sam Xi; Viji Srinivasan; Gu-Yeon Wei; David Brooks Toward Cache-Friendly Hardware Accelerators Conference HPCA Sensors and Cloud Architectures Workshop (SCAW), 2015. @conference{Shao2015_scaw, title = {Toward Cache-Friendly Hardware Accelerators}, author = {Yakun Sophia Shao and Sam Xi and Viji Srinivasan and Gu-Yeon Wei and David Brooks}, url = {http://www.eecs.harvard.edu/~shao/papers/shao2015-scaw.pdf}, year = {2015}, date = {2015-02-07}, booktitle = {HPCA Sensors and Cloud Architectures Workshop (SCAW)}, abstract = {Increasing demand for power-efficient, high-performance computing has spurred a growing number and diversity of hardware accelerators in mobile Systems on Chip (SoCs) as well as servers and desktops. Despite their energy efficiency, fixed-function accelerators lack programmability, especially compared with general-purpose processors. Today’s accelerators rely on software-managed scratchpad memory and Direct Memory Access (DMA) to provide fixed-latency memory access and data transfer, which leads to significant chip resource and software engineering costs. On the other hand, hardware-managed caches with support for virtual memory and cache coherence are well-known to ease programmability in general-purpose processors, but these features are not commonly supported in today’s fixed-function accelerators. As a first step toward cache-friendly accelerator design, this paper discusses limitations of scratchpad-based memories in today’s accelerators, identifies challenges to support hardware-managed caches, and explores opportunities to ease the cache integration.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Increasing demand for power-efficient, high-performance computing has spurred a growing number and diversity of hardware accelerators in mobile Systems on Chip (SoCs) as well as servers and desktops. Despite their energy efficiency, fixed-function accelerators lack programmability, especially compared with general-purpose processors. Today’s accelerators rely on software-managed scratchpad memory and Direct Memory Access (DMA) to provide fixed-latency memory access and data transfer, which leads to significant chip resource and software engineering costs. On the other hand, hardware-managed caches with support for virtual memory and cache coherence are well-known to ease programmability in general-purpose processors, but these features are not commonly supported in today’s fixed-function accelerators. As a first step toward cache-friendly accelerator design, this paper discusses limitations of scratchpad-based memories in today’s accelerators, identifies challenges to support hardware-managed caches, and explores opportunities to ease the cache integration. |
Brandon Reagen; Robert Adolf; Gu-Yeon Wei; David Brooks The MachSuite Benchmark Conference Boston Area Architecture Workshop (BARC), 2015. @conference{Reagen_MSBARC_2015, title = {The MachSuite Benchmark}, author = {Brandon Reagen and Robert Adolf and Gu-Yeon Wei and David Brooks}, url = {http://www.eecs.harvard.edu/~reagen/papers/ms_barc.pdf}, year = {2015}, date = {2015-01-30}, booktitle = {Boston Area Architecture Workshop (BARC)}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |