% Reading-list bibliography (computer architecture papers).
%
% Conventions used in this file:
%   * One field per line, "=" aligned within an entry, values brace-delimited.
%   * month uses the bare three-letter BibTeX macros (jan .. dec).
%   * Page ranges use a double hyphen.
%   * Citation keys are kept exactly as they were so existing citations keep
%     resolving, even where a key no longer matches the entry data.
%   * NOTE(review): the note field appears to carry schedule/topic tags
%     (for example "Week9, DSA-I"); the tag strings are left untouched in case
%     other tooling filters on them. Confirm before normalising them.

@article{cascavalTaxonomyAcceleratorArchitectures2010,
  author   = {Cascaval, C. and Chatterjee, S. and Franke, H. and Gildea, K. J. and Pattnaik, P.},
  title    = {A Taxonomy of Accelerator Architectures and Their Programming Models},
  journal  = {IBM Journal of Research and Development},
  year     = {2010},
  month    = sep,
  volume   = {54},
  number   = {5},
  pages    = {5:1--5:10},
  doi      = {10.1147/JRD.2010.2059721},
  issn     = {0018-8646},
  file     = {/Users/ashriram/Zotero/storage/W8K2YRKL/Cascaval et al. - 2010 - A taxonomy of accelerator architectures and their .pdf},
  abstract = {As the clock frequency of silicon chips is leveling off, the computer architecture community is looking for different solutions to continue application performance scaling. One such solution is the multicore approach, i.e., using multiple simple cores that enable higher performance than wide superscalar processors, provided that the workload can exploit the parallelism. Another emerging alternative is the use of customized designs (accelerators) at different levels within the system. These are specialized functional units integrated with the core, specialized cores, attached processors, or attached appliances. The design tradeoff is quite compelling because current processor chips have billions of transistors, but they cannot all be activated or switched at the same time at high frequencies. Specialized designs provide increased power efficiency but cannot be used as general-purpose compute engines. Therefore, architects trade area for power efficiency by placing in the design additional units that are known to be active at different times. The resulting system is a heterogeneous architecture, with the potential of specialized execution that accelerates different workloads. While designing and building such hardware systems is attractive, writing and porting software to a heterogeneous platform is even more challenging than parallelism for homogeneous multicore systems. In this paper, we propose a taxonomy that allows us to define classes of accelerators, with the goal of focusing on a small set of programming models for accelerators. We discuss several types of currently popular accelerators and identify challenges to exploiting such accelerators in current software stacks. This paper serves as a guide for both hardware designers by providing them with a view on how software best exploits specialization and software programmers by focusing research efforts to address parallelism and heterogeneity.},
  note     = {Week9, DSA-I}
}

% NOTE(review): arXiv preprint; the arXiv identifier is duplicated in the
% journal field. Kept as exported so classic styles still print a venue.
@article{mittalSurveyTechniquesDynamic2018,
  author        = {Mittal, Sparsh},
  title         = {A {{Survey}} of {{Techniques}} for {{Dynamic Branch Prediction}}},
  journal       = {arXiv:1804.00261 [cs]},
  year          = {2018},
  month         = apr,
  eprint        = {1804.00261},
  eprinttype    = {arxiv},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  abstract      = {Branch predictor (BP) is an essential component in modern processors since high BP accuracy can improve performance and reduce energy by decreasing the number of instructions executed on wrong-path. However, reducing latency and storage overhead of BP while maintaining high accuracy presents significant challenges. In this paper, we present a survey of dynamic branch prediction techniques. We classify the works based on key features to underscore their differences and similarities. We believe this paper will spark further research in this area and will be useful for computer architects, processor designers and researchers.},
  note          = {Branch predictor, Week2}
}

@inproceedings{linBranchPredictionNot2019,
  author        = {Lin, Chit-Kwan and Tarsa, Stephen J.},
  title         = {Branch {{Prediction Is Not}} a {{Solved Problem}}: {{Measurements}}, {{Opportunities}}, and {{Future Directions}}},
  shorttitle    = {Branch {{Prediction Is Not}} a {{Solved Problem}}},
  booktitle     = {2019 IEEE International Symposium on Workload Characterization (IISWC)},
  year          = {2019},
  month         = nov,
  pages         = {228--238},
  doi           = {10.1109/IISWC47752.2019.9042108},
  eprint        = {1906.08170},
  eprinttype    = {arxiv},
  archiveprefix = {arXiv},
  abstract      = {Modern branch predictors predict the vast majority of conditional branch instructions with near-perfect accuracy, allowing superscalar, out-of-order processors to maximize speculative efficiency and thus performance. However, this impressive overall effectiveness belies a substantial missed opportunity in single-threaded instructions per cycle (IPC). For example, we show that correcting the mispredictions made by the state-of-the-art TAGE-SC-L branch predictor on SPECint 2017 would improve IPC by margins similar to an advance in process technology node. In this work, we measure and characterize these mispredictions. We find that they categorically arise from either (1) a small number of systematically hard-to-predict (H2P) branches; or (2) rare branches with low dynamic execution counts. Using data from SPECint 2017 and additional large code footprint applications, we quantify the occurrence and IPC impact of these two categories. We then demonstrate that increasing the resources afforded to existing branch predictors does not alone address the root causes of most mispredictions. This leads us to reexamine basic assumptions in branch prediction and to propose new research directions that, for example, deploy machine learning to improve pattern matching for H2Ps, and use on-chip phase learning to track long-term statistics for rare branches.},
  note          = {Week2, Branch predictor}
}

@inproceedings{parashar-isca-2013,
  author    = {Parashar, Angshuman and Pellauer, Michael and Adler, Michael and Ahsan, Bushra and Crago, Neal and Lustig, Daniel and Pavlov, Vladimir and Zhai, Antonia and Gambhir, Mohit and Jaleel, Aamer and Allmon, Randy and Rayess, Rachid and Maresh, Stephen and Emer, Joel},
  title     = {{Triggered instructions: a control paradigm for spatially-programmed architectures}},
  booktitle = {Proc. of the 40th ISCA},
  year      = {2013},
  month     = apr,
  pages     = {1--12},
  note      = {Dataflow, Week9}
}

@inproceedings{swanson-micro-2003,
  author    = {Swanson, Steven and Michelson, Ken and Schwerin, Andrew and Oskin, Mark},
  title     = {{WaveScalar}},
  booktitle = {Proc. of the 36th MICRO},
  year      = {2003},
  note      = {Dataflow}
}

@inproceedings{gebhart-asplos-2009,
  author    = {Gebhart, Mark and Maher, Bertrand A and Coons, Katherine E and Diamond, Jeff and Gratz, Paul and Marino, Mario and Ranganathan, Nitya and Robatmili, Behnam and Smith, Aaron and Burrill, James and Keckler, Stephen W and Burger, Doug and McKinley, Kathryn S},
  title     = {{An evaluation of the TRIPS computer system}},
  booktitle = {Proc. of the 14th ASPLOS},
  year      = {2009},
  note      = {Dataflow, Week9}
}

@inproceedings{taylor-isca-2004,
  author    = {Taylor, Michael Bedford and Lee, Walter and Miller, Jason and Wentzlaff, David and Bratt, Ian and Greenwald, Ben and Hoffmann, Henry and Johnson, Paul and Kim, Jason and Psota, James and Saraf, Arvind and Shnidman, Nathan and Strumpen, Volker and Frank, Matt and Amarasinghe, Saman and Agarwal, Anant},
  title     = {{Evaluation of the Raw Microprocessor: An Exposed-Wire-Delay Architecture for ILP and Streams}},
  booktitle = {Proc. of the 31st ISCA},
  year      = {2004},
  note      = {DSA, Week9}
}

@inproceedings{hill-hpca-2008,
  author    = {Hill, M D},
  title     = {{Amdahl's Law in the multicore era}},
  booktitle = {Proc. of the 14th HPCA},
  year      = {2008},
  note      = {Technology, Week8}
}

@inproceedings{esmaeilzadeh-isca-2011,
  author    = {Esmaeilzadeh, Hadi and Blem, Emily and St. Amant, Renee and Sankaralingam, Karthikeyan and Burger, Doug},
  title     = {{Dark silicon and the end of multicore scaling}},
  booktitle = {Proc. of the 38th ISCA},
  year      = {2011},
  note      = {power, Week8}
}

@inproceedings{venkatesh-asplos-2010,
  author    = {Venkatesh, Ganesh and Sampson, Jack and Goulding, Nathan and Garcia, Saturnino and Bryksin, Vladyslav and Lugo-Martinez, Jose and Swanson, Steven and Taylor, Michael Bedford},
  title     = {{Conservation cores: reducing the energy of mature computations}},
  booktitle = {Proc. of the 15th ASPLOS},
  year      = {2010},
  note      = {DSAI, Week9}
}

@inproceedings{sharifian-micro-2016,
  author    = {Sharifian, Amirali and Kumar, Snehasish and Guha, Apala and Shriraman, Arrvindh},
  title     = {{Chainsaw: Von-neumann accelerators to leverage fused instruction chains}},
  booktitle = {Proc. of the 49th MICRO},
  year      = {2016},
  pages     = {1--14},
  note      = {DSAI, Week9}
}

@article{martin-cacm-2012,
  author  = {Martin, Milo M K and Hill, Mark D and Sorin, Daniel J},
  title   = {{Why On-Chip Cache Coherence is Here to Stay}},
  journal = {Communications of the ACM},
  year    = {2012},
  note    = {Coherence, Week7}
}

% Magazine article (the venue was previously stored as a booktitle).
@article{martin-ieeemicro-2003,
  author  = {Martin, M M K and Hill, M D and Wood, D A},
  title   = {{Token coherence: a new framework for shared-memory multiprocessors}},
  journal = {IEEE Micro},
  year    = {2003},
  month   = nov,
  note    = {Coherence, Week7}
}

@inproceedings{gharachorloo-lenoski-isca-1998,
  author    = {Gharachorloo, Kourosh and Lenoski, Daniel and Laudon, James and Gibbons, Phillip and Gupta, Anoop and Hennessy, John},
  title     = {{Memory consistency and event ordering in scalable shared-memory multiprocessors}},
  booktitle = {25 years of the international symposia on computer architecture (selected papers)},
  year      = {1998},
  note      = {Consistency, Week11}
}

@inproceedings{boehm-pldi-2008,
  author    = {Boehm, Hans-J and Adve, Sarita V},
  title     = {{Foundations of the C++ concurrency memory model}},
  booktitle = {Proc. of the 2008 PLDI},
  year      = {2008},
  note      = {Consistency, Week11}
}

@inproceedings{shriraman-isca-2007,
  author    = {Shriraman, Arrvindh and Spear, Michael F and Hossain, Hemayet and Marathe, Virendra J and Dwarkadas, Sandhya and Scott, Michael L},
  title     = {{An integrated hardware-software approach to flexible transactional memory}},
  booktitle = {ISCA '07: Proceedings of the 34th annual international symposium on Computer architecture},
  year      = {2007},
  month     = jun,
  publisher = {ACM},
  note      = {Parallel}
}

@article{mudge-power,
  author  = {Mudge, T.},
  title   = {Power: a first-class architectural design constraint},
  journal = {Computer},
  year    = {2001},
  volume  = {34},
  number  = {4},
  pages   = {52--58},
  note    = {power, Week8}
}

@inproceedings{turakhia-asplos-2018,
  author    = {Turakhia, Yatish and Bejerano, Gill and Dally, William J},
  title     = {{Darwin: A Genomics Co-processor Provides up to 15,000X Acceleration on Long Read Assembly}},
  booktitle = {Proc. of the 23rd ASPLOS},
  year      = {2018},
  pages     = {199--213},
  note      = {DSA-II, Week10}
}

@inproceedings{chen-isca-2016,
  author    = {Chen, Yu-Hsin and Emer, Joel S and Sze, Vivienne},
  title     = {{Eyeriss: A Spatial Architecture for Energy-Efficient Dataflow for Convolutional Neural Networks}},
  booktitle = {Proc. of the 43rd ISCA},
  year      = {2016},
  pages     = {367--379},
  note      = {DSA-II, Week10}
}

@inproceedings{putnam-isca-2014,
  author    = {Putnam, Andrew and Caulfield, Adrian M and Chung, Eric S and Chiou, Derek and Constantinides, Kypros and Demme, John and Esmaeilzadeh, Hadi and Fowers, Jeremy and Gopal, Gopi Prashanth and Gray, Jan and Haselman, Michael and Hauck, Scott and Heil, Stephen and Hormati, Amir and Kim, Joo-Young and Lanka, Sitaram and Larus, James R and Peterson, Eric and Pope, Simon and Smith, Aaron and Thong, Jason and Xiao, Phillip Yi and Burger, Doug},
  title     = {{A reconfigurable fabric for accelerating large-scale datacenter services}},
  booktitle = {Proc. of the 41st ISCA},
  year      = {2014},
  pages     = {13--24},
  note      = {DSAI, Week9, Extra}
}

@inproceedings{nowatzki-isca-2017,
  author    = {Nowatzki, Tony and Gangadhar, Vinay and Ardalani, Newsha and Sankaralingam, Karthikeyan},
  title     = {{Stream-Dataflow Acceleration}},
  booktitle = {Proc. of the 44th ISCA},
  year      = {2017},
  pages     = {416--429},
  note      = {DSAI, Week9, Extra}
}

@inproceedings{Kocher-security-2019,
  author    = {Kocher, Paul and Horn, Jann and Fogh, Anders and Genkin, Daniel and Gruss, Daniel and Haas, Werner and Hamburg, Mike and Lipp, Moritz and Mangard, Stefan and Prescher, Thomas and Schwarz, Michael and Yarom, Yuval},
  title     = {Spectre Attacks: Exploiting Speculative Execution},
  booktitle = {2019 IEEE Symposium on Security and Privacy (SP)},
  year      = {2019},
  pages     = {1--19},
  note      = {Security}
}

@inproceedings{yu-micro-2019,
  author    = {Yu, Jiyong and Yan, Mengjia and Khyzha, Artem and Morrison, Adam and Torrellas, Josep and Fletcher, Christopher W},
  title     = {{Speculative Taint Tracking (STT): A Comprehensive Protection for Speculatively Accessed Data}},
  booktitle = {Proc. of the 52nd MICRO},
  year      = {2019},
  pages     = {954--968},
  note      = {Security}
}

@inproceedings{sohi-isca-1995,
  author    = {Sohi, Gurindar S and Breach, Scott E and Vijaykumar, T N},
  title     = {{Multiscalar processors}},
  booktitle = {Proc. of the 22nd ISCA},
  year      = {1995},
  note      = {Week12, Multithreading}
}

@inproceedings{tullsen-isca-1995,
  author    = {Tullsen, Dean M and Eggers, Susan J and Levy, Henry M},
  title     = {{Simultaneous Multithreading: Maximizing On-Chip Parallelism}},
  booktitle = {Proc. of the 22nd ISCA},
  year      = {1995},
  note      = {Week12, Multithreading}
}

% The key says 1967 but the year field is 1974; the key is kept unchanged so
% existing citations keep resolving.
@article{dennard-1967,
  author  = {Dennard, R. H. and Gaensslen, F. H. and Yu, Hwa-Nien and Rideout, V. L. and Bassous, E. and LeBlanc, A. R.},
  title   = {Design of ion-implanted {MOSFET's} with very small physical dimensions},
  journal = {IEEE Journal of Solid-State Circuits},
  year    = {1974},
  volume  = {9},
  number  = {5},
  pages   = {256--268},
  note    = {Week8, Technology}
}

@article{gochman-itj-2003,
  author  = {Gochman, Simcha and Ronen, Ronny and Anati, Ittai and Berkovits, Ariel and Kurts, Tsvika and Naveh, Alon and Saeed, Ali and Sperber, Zeev and Valentine, Robert C.},
  title   = {The {Intel} {Pentium M} Processor: Microarchitecture and Performance},
  journal = {Intel Technology Journal},
  year    = {2003},
  volume  = {7},
  number  = {2},
  pages   = {21--35},
  note    = {Week8, Technology}
}

@article{superscalar1,
  author  = {Smith, J. E. and Sohi, G. S.},
  title   = {The microarchitecture of superscalar processors},
  journal = {Proceedings of the IEEE},
  year    = {1995},
  volume  = {83},
  number  = {12},
  pages   = {1609--1624},
  note    = {Week1, OOO}
}

@inproceedings{palacharla-isca-1997,
  author    = {Palacharla, Subbarao and Jouppi, Norman P and Smith, J E},
  title     = {{Complexity-effective superscalar processors}},
  booktitle = {Proc. of the 24th ISCA},
  year      = {1997},
  note      = {Week3, OOO}
}

@inproceedings{chrysos-isca-1998,
  author    = {Chrysos, George Z and Emer, Joel S},
  title     = {{Memory Dependence Prediction Using Store Sets}},
  booktitle = {Proc. of the 25th ISCA},
  year      = {1998},
  note      = {Week3, OOO}
}

@inproceedings{sethumadhavan-isca-2007,
  author    = {Sethumadhavan, Simha and Roesner, Franziska and Emer, Joel S and Burger, Doug and Keckler, Stephen W},
  title     = {{Late-binding: enabling unordered load-store queues}},
  booktitle = {Proc. of the 34th ISCA},
  year      = {2007},
  note      = {Week3, OOO}
}

% Edition number follows the other HPCA entries in this file
% (14th in 2008, 19th in 2013, hence 24th in 2018).
@inproceedings{vedula-hpca-2018,
  author    = {Vedula, Naveen and Shriraman, Arrvindh and Kumar, Snehasish and Sumner, Nick},
  title     = {{NACHOS}: Software-Driven Hardware-Assisted Memory Disambiguation for Accelerators},
  booktitle = {Proc. of the 24th Intl. Symp. on High Performance Computer Architecture},
  series    = {HPCA},
  year      = {2018},
  note      = {Week9, DSAI}
}

@article{baugh_lsq,
  author     = {Baugh, L. and Zilles, C.},
  title      = {Decomposing the Load-Store Queue by Function for Power Reduction and Scalability},
  journal    = {IBM Journal of Research and Development},
  year       = {2006},
  month      = mar,
  volume     = {50},
  number     = {2/3},
  pages      = {287--297},
  numpages   = {11},
  issue_date = {March 2006},
  publisher  = {IBM Corp.},
  address    = {USA},
  issn       = {0018-8646},
  note       = {Week3, LSQ}
}

@inproceedings{olukotun-asplos-1996,
  author    = {Olukotun, Kunle and Nayfeh, Basem A and Hammond, Lance and Wilson, Ken and Chang, Kunyung},
  title     = {{The case for a single-chip multiprocessor}},
  booktitle = {Proc. of the 7th ASPLOS},
  year      = {1996},
  note      = {Multicore, Week6}
}

% NOTE(review): the title looks like an exported PDF file name rather than the
% title of the work; replace it once the real title and venue are confirmed.
@misc{mutluMainmemorysystemKiise15Pdf2015,
  author = {Mutlu, Onur},
  title  = {Main-Memory-System\_kiise15.Pdf},
  year   = {2015},
  note   = {DRAM, Week11}
}

@inproceedings{zhaoCOBRAFrameworkEvaluating2021,
  author     = {Zhao, Jerry and Gonzalez, Abraham and Amid, Alon and Karandikar, Sagar and Asanovi{\'c}, Krste},
  title      = {{{COBRA}}: {{A Framework}} for {{Evaluating Compositions}} of {{Hardware Branch Predictors}}},
  shorttitle = {{{COBRA}}},
  booktitle  = {2021 {{IEEE International Symposium}} on {{Performance Analysis}} of {{Systems}} and {{Software}} ({{ISPASS}})},
  year       = {2021},
  month      = mar,
  pages      = {310--320},
  note       = {Week2, Extra, Branch predictor}
}

@inproceedings{singh-hpca-2013,
  author    = {Singh, Inderpreet and Shriraman, Arrvindh and Fung, Wilson W L and O'Connor, Mike and Aamodt, Tor M},
  title     = {Cache Coherence for {{GPU}} Architectures},
  booktitle = {Proc. of the 19th {{HPCA}}},
  year      = {2013},
  pages     = {578--590},
  note      = {Coherence,Week7}
}

@article{bedfordtaylorEvolutionBitcoinHardware2017,
  author  = {Taylor, Michael Bedford},
  title   = {The {{Evolution}} of {{Bitcoin Hardware}}},
  journal = {Computer},
  year    = {2017},
  volume  = {50},
  number  = {9},
  pages   = {58--66},
  note    = {Technology,Week8}
}

% NOTE(review): the journal field describes a technical report; consider the
% techreport entry type with an institution field once the issuing
% institution(s) and report number are confirmed.
@article{adve-tr-1995,
  author  = {Adve, Sarita and Gharachorloo, Kourosh},
  title   = {Shared Memory Consistency Models: A Tutorial},
  journal = {Rice and WRL Technical Report},
  year    = {1995},
  note    = {Week11, Memory}
}