accounts_db.rs 317 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
772787279728072817282728372847285728672877288728972907291729272937294729572967297729872997300730173027303730473057306730773087309731073117312731373147315731673177318731973207321732273237324732573267327732873297330733173327333733473357336733773387339734073417342734373447345734673477348734973507351735273537354735573567357735873597360736173627363736473657366736773687369737073717372737373747375737673777378737973807381738273837384738573867387738873897390739173927393739473957396739773987399740074017402740374047405740674077408740974107411741274137414741574167417
  1. //! Persistent accounts are stored at this path location:
  2. //! `<path>/<pid>/data/`
  3. //!
  4. //! The persistent store would allow for this mode of operation:
  5. //! - Concurrent single thread append with many concurrent readers.
  6. //!
  7. //! The underlying memory is memory mapped to a file. The accounts would be
  8. //! stored across multiple files and the mappings of file and offset of a
  9. //! particular account would be stored in a shared index. This will allow for
  10. //! concurrent commits without blocking reads, which will sequentially write
//! to memory, ssd, or disk, and should be as fast as the hardware allows.
  12. //! The only required in memory data structure with a write lock is the index,
  13. //! which should be fast to update.
  14. //!
  15. //! [`AppendVec`]'s only store accounts for single slots. To bootstrap the
  16. //! index from a persistent store of [`AppendVec`]'s, the entries include
  17. //! a "write_version". A single global atomic `AccountsDb::write_version`
  18. //! tracks the number of commits to the entire data store. So the latest
  19. //! commit for each slot entry would be indexed.
  20. mod accounts_db_config;
  21. mod geyser_plugin_utils;
  22. pub mod stats;
  23. pub mod tests;
  24. pub use accounts_db_config::{
  25. AccountsDbConfig, ACCOUNTS_DB_CONFIG_FOR_BENCHMARKS, ACCOUNTS_DB_CONFIG_FOR_TESTING,
  26. };
  27. #[cfg(feature = "dev-context-only-utils")]
  28. use qualifier_attr::qualifiers;
  29. use {
  30. crate::{
  31. account_info::{AccountInfo, Offset, StorageLocation},
  32. account_storage::{
  33. stored_account_info::{StoredAccountInfo, StoredAccountInfoWithoutData},
  34. AccountStorage, AccountStoragesOrderer, ShrinkInProgress,
  35. },
  36. accounts_cache::{AccountsCache, CachedAccount, SlotCache},
  37. accounts_db::stats::{
  38. AccountsStats, CleanAccountsStats, FlushStats, ObsoleteAccountsStats, PurgeStats,
  39. ShrinkAncientStats, ShrinkStats, ShrinkStatsSub, StoreAccountsTiming,
  40. },
  41. accounts_file::{AccountsFile, AccountsFileError, AccountsFileProvider, StorageAccess},
  42. accounts_hash::{AccountLtHash, AccountsLtHash, ZERO_LAMPORT_ACCOUNT_LT_HASH},
  43. accounts_index::{
  44. in_mem_accounts_index::StartupStats, AccountSecondaryIndexes, AccountsIndex,
  45. AccountsIndexRootsStats, AccountsIndexScanResult, IndexKey, IsCached, ReclaimsSlotList,
  46. RefCount, ScanConfig, ScanFilter, ScanResult, SlotList, Startup, UpsertReclaim,
  47. },
  48. accounts_update_notifier_interface::{AccountForGeyser, AccountsUpdateNotifier},
  49. active_stats::{ActiveStatItem, ActiveStats},
  50. ancestors::Ancestors,
  51. append_vec::{self, aligned_stored_size, STORE_META_OVERHEAD},
  52. contains::Contains,
  53. is_zero_lamport::IsZeroLamport,
  54. obsolete_accounts::ObsoleteAccounts,
  55. partitioned_rewards::PartitionedEpochRewardsConfig,
  56. read_only_accounts_cache::ReadOnlyAccountsCache,
  57. storable_accounts::{StorableAccounts, StorableAccountsBySlot},
  58. u64_align,
  59. utils::{self, create_account_shared_data},
  60. },
  61. agave_fs::buffered_reader::RequiredLenBufFileRead,
  62. dashmap::{DashMap, DashSet},
  63. log::*,
  64. rand::{thread_rng, Rng},
  65. rayon::{prelude::*, ThreadPool},
  66. seqlock::SeqLock,
  67. smallvec::SmallVec,
  68. solana_account::{Account, AccountSharedData, ReadableAccount},
  69. solana_clock::{BankId, Epoch, Slot},
  70. solana_epoch_schedule::EpochSchedule,
  71. solana_lattice_hash::lt_hash::LtHash,
  72. solana_measure::{measure::Measure, measure_us},
  73. solana_nohash_hasher::{BuildNoHashHasher, IntMap, IntSet},
  74. solana_pubkey::Pubkey,
  75. solana_rayon_threadlimit::get_thread_count,
  76. solana_transaction::sanitized::SanitizedTransaction,
  77. std::{
  78. borrow::Cow,
  79. boxed::Box,
  80. collections::{BTreeSet, HashMap, HashSet, VecDeque},
  81. io, iter, mem,
  82. num::Saturating,
  83. ops::RangeBounds,
  84. path::{Path, PathBuf},
  85. sync::{
  86. atomic::{AtomicBool, AtomicU32, AtomicU64, AtomicUsize, Ordering},
  87. Arc, Condvar, Mutex, RwLock, RwLockReadGuard,
  88. },
  89. thread::{self, sleep},
  90. time::{Duration, Instant},
  91. },
  92. tempfile::TempDir,
  93. };
// when the accounts write cache exceeds this many bytes, we will flush it
// this can be specified on the command line, too (--accounts-db-cache-limit-mb)
const WRITE_CACHE_LIMIT_BYTES_DEFAULT: u64 = 15_000_000_000;
// threshold at which slot scans switch to a parallel iterator
// NOTE(review): unit appears to be item count — confirm at call sites
const SCAN_SLOT_PAR_ITER_THRESHOLD: usize = 4000;
// number of accounts to unref per batch (presumably to bound how long index
// locks are held — verify against callers)
const UNREF_ACCOUNTS_BATCH_SIZE: usize = 10_000;
// default size, in bytes, of a newly created accounts file (4 MiB)
const DEFAULT_FILE_SIZE: u64 = 4 * 1024 * 1024;
// default number of account storage directories
const DEFAULT_NUM_DIRS: u32 = 4;
// This value reflects recommended memory lock limit documented in the validator's
// setup instructions at docs/src/operations/guides/validator-start.md allowing use of
// several io_uring instances with fixed buffers for large disk IO operations.
pub const DEFAULT_MEMLOCK_BUDGET_SIZE: usize = 2_000_000_000;
// Linux distributions often have some small memory lock limit (e.g. 8MB) that we can tap into.
const MEMLOCK_BUDGET_SIZE_FOR_TESTS: usize = 4_000_000;
// When getting accounts for shrinking from the index, this is the # of accounts to lookup per thread.
// This allows us to split up accounts index accesses across multiple threads.
const SHRINK_COLLECT_CHUNK_SIZE: usize = 50;
/// The number of shrink candidate slots that is small enough so that
/// additional storages from ancient slots can be added to the
/// candidates for shrinking.
const SHRINK_INSERT_ANCIENT_THRESHOLD: usize = 10;
/// Whether a storage scan should hand account data to the callback.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ScanAccountStorageData {
    /// callback for accounts in storage will not include `data`
    // Note, currently only used in tests, but do not remove.
    #[cfg_attr(not(test), allow(dead_code))]
    NoData,
    /// return data (&[u8]) for each account.
    /// This can be expensive to get and is not necessary for many scan operations.
    DataRefForStorage,
}
#[derive(Default, Debug)]
/// hold alive accounts
/// alive means in the accounts index
pub(crate) struct AliveAccounts<'a> {
    /// slot the accounts are currently stored in
    pub(crate) slot: Slot,
    /// the alive accounts collected from this slot's storage
    pub(crate) accounts: Vec<&'a AccountFromStorage>,
    /// running sum of `stored_size()` over `accounts`
    pub(crate) bytes: usize,
}
/// separate pubkeys into those with a single refcount and those with > 1 refcount
#[derive(Debug)]
pub(crate) struct ShrinkCollectAliveSeparatedByRefs<'a> {
    /// accounts where ref_count = 1
    pub(crate) one_ref: AliveAccounts<'a>,
    /// accounts where ref_count > 1, but this slot contains the alive entry with the highest slot
    pub(crate) many_refs_this_is_newest_alive: AliveAccounts<'a>,
    /// accounts where ref_count > 1, and this slot is NOT the highest alive entry in the index for the pubkey
    pub(crate) many_refs_old_alive: AliveAccounts<'a>,
}
/// Collects references to alive accounts while scanning a storage for shrink.
pub(crate) trait ShrinkCollectRefs<'a>: Sync + Send {
    /// Creates an empty collector for `slot`, pre-sized for `capacity` accounts.
    fn with_capacity(capacity: usize, slot: Slot) -> Self;
    /// Merges the accounts collected in `other` into `self`.
    fn collect(&mut self, other: Self);
    /// Records one alive `account`, with its `ref_count` and `slot_list` from the accounts index.
    fn add(
        &mut self,
        ref_count: RefCount,
        account: &'a AccountFromStorage,
        slot_list: &[(Slot, AccountInfo)],
    );
    /// Number of accounts collected so far.
    fn len(&self) -> usize;
    /// Total stored bytes of the collected accounts.
    fn alive_bytes(&self) -> usize;
    /// All collected accounts; not supported by every implementor.
    fn alive_accounts(&self) -> &Vec<&'a AccountFromStorage>;
}
  156. impl<'a> ShrinkCollectRefs<'a> for AliveAccounts<'a> {
  157. fn collect(&mut self, mut other: Self) {
  158. self.bytes = self.bytes.saturating_add(other.bytes);
  159. self.accounts.append(&mut other.accounts);
  160. }
  161. fn with_capacity(capacity: usize, slot: Slot) -> Self {
  162. Self {
  163. accounts: Vec::with_capacity(capacity),
  164. bytes: 0,
  165. slot,
  166. }
  167. }
  168. fn add(
  169. &mut self,
  170. _ref_count: RefCount,
  171. account: &'a AccountFromStorage,
  172. _slot_list: &[(Slot, AccountInfo)],
  173. ) {
  174. self.accounts.push(account);
  175. self.bytes = self.bytes.saturating_add(account.stored_size());
  176. }
  177. fn len(&self) -> usize {
  178. self.accounts.len()
  179. }
  180. fn alive_bytes(&self) -> usize {
  181. self.bytes
  182. }
  183. fn alive_accounts(&self) -> &Vec<&'a AccountFromStorage> {
  184. &self.accounts
  185. }
  186. }
  187. impl<'a> ShrinkCollectRefs<'a> for ShrinkCollectAliveSeparatedByRefs<'a> {
  188. fn collect(&mut self, other: Self) {
  189. self.one_ref.collect(other.one_ref);
  190. self.many_refs_this_is_newest_alive
  191. .collect(other.many_refs_this_is_newest_alive);
  192. self.many_refs_old_alive.collect(other.many_refs_old_alive);
  193. }
  194. fn with_capacity(capacity: usize, slot: Slot) -> Self {
  195. Self {
  196. one_ref: AliveAccounts::with_capacity(capacity, slot),
  197. many_refs_this_is_newest_alive: AliveAccounts::with_capacity(0, slot),
  198. many_refs_old_alive: AliveAccounts::with_capacity(0, slot),
  199. }
  200. }
  201. fn add(
  202. &mut self,
  203. ref_count: RefCount,
  204. account: &'a AccountFromStorage,
  205. slot_list: &[(Slot, AccountInfo)],
  206. ) {
  207. let other = if ref_count == 1 {
  208. &mut self.one_ref
  209. } else if slot_list.len() == 1
  210. || !slot_list
  211. .iter()
  212. .any(|(slot_list_slot, _info)| slot_list_slot > &self.many_refs_old_alive.slot)
  213. {
  214. // this entry is alive but is newer than any other slot in the index
  215. &mut self.many_refs_this_is_newest_alive
  216. } else {
  217. // This entry is alive but is older than at least one other slot in the index.
  218. // We would expect clean to get rid of the entry for THIS slot at some point, but clean hasn't done that yet.
  219. &mut self.many_refs_old_alive
  220. };
  221. other.add(ref_count, account, slot_list);
  222. }
  223. fn len(&self) -> usize {
  224. self.one_ref
  225. .len()
  226. .saturating_add(self.many_refs_old_alive.len())
  227. .saturating_add(self.many_refs_this_is_newest_alive.len())
  228. }
  229. fn alive_bytes(&self) -> usize {
  230. self.one_ref
  231. .alive_bytes()
  232. .saturating_add(self.many_refs_old_alive.alive_bytes())
  233. .saturating_add(self.many_refs_this_is_newest_alive.alive_bytes())
  234. }
  235. fn alive_accounts(&self) -> &Vec<&'a AccountFromStorage> {
  236. unimplemented!("illegal use");
  237. }
  238. }
/// How reclaims produced by accounts index upserts should be handled when storing.
pub enum StoreReclaims {
    /// normal reclaim mode
    Default,
    /// do not return reclaims from accounts index upsert
    Ignore,
}
/// specifies how to return zero lamport accounts from a load
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LoadZeroLamports {
    /// return None if loaded account has zero lamports
    None,
    /// return Some(account with zero lamports) if loaded account has zero lamports
    /// This used to be the only behavior.
    /// Note that this is non-deterministic if clean is running asynchronously.
    /// If a zero lamport account exists in the index, then Some is returned.
    /// Once it is cleaned from the index, None is returned.
    #[cfg(feature = "dev-context-only-utils")]
    SomeWithZeroLamportAccountForTests,
}
/// Results of collecting a storage's accounts in preparation for shrink.
#[derive(Debug)]
pub(crate) struct ShrinkCollect<'a, T: ShrinkCollectRefs<'a>> {
    /// slot of the storage being shrunk
    pub(crate) slot: Slot,
    /// capacity of the original storage
    pub(crate) capacity: u64,
    /// pubkeys of dead accounts to be unref'd in the accounts index
    pub(crate) pubkeys_to_unref: Vec<&'a Pubkey>,
    /// pubkeys that are the last remaining zero lamport instance of an account
    pub(crate) zero_lamport_single_ref_pubkeys: Vec<&'a Pubkey>,
    /// accounts that are still alive and must be carried forward
    pub(crate) alive_accounts: T,
    /// total size in storage of all alive accounts
    pub(crate) alive_total_bytes: usize,
    /// number of accounts in the storage before shrinking
    pub(crate) total_starting_accounts: usize,
    /// true if all alive accounts are zero lamports
    pub(crate) all_are_zero_lamports: bool,
}
/// Partial result of looking up a batch of a storage's accounts in the
/// accounts index while preparing a shrink.
struct LoadAccountsIndexForShrink<'a, T: ShrinkCollectRefs<'a>> {
    /// all alive accounts
    alive_accounts: T,
    /// pubkeys that are going to be unref'd in the accounts index after we are
    /// done with shrinking, because they are dead
    pubkeys_to_unref: Vec<&'a Pubkey>,
    /// pubkeys that are the last remaining zero lamport instance of an account
    zero_lamport_single_ref_pubkeys: Vec<&'a Pubkey>,
    /// true if all alive accounts are zero lamport accounts
    all_are_zero_lamports: bool,
}
/// reference an account found during scanning a storage.
#[derive(Debug, PartialEq, Copy, Clone)]
pub struct AccountFromStorage {
    /// where the account is stored, plus its zero-lamport flag
    pub index_info: AccountInfo,
    /// length of the account's data, in bytes
    pub data_len: u64,
    /// the account's address
    pub pubkey: Pubkey,
}
  289. impl IsZeroLamport for AccountFromStorage {
  290. fn is_zero_lamport(&self) -> bool {
  291. self.index_info.is_zero_lamport()
  292. }
  293. }
  294. impl AccountFromStorage {
  295. pub fn pubkey(&self) -> &Pubkey {
  296. &self.pubkey
  297. }
  298. pub fn stored_size(&self) -> usize {
  299. aligned_stored_size(self.data_len as usize)
  300. }
  301. pub fn data_len(&self) -> usize {
  302. self.data_len as usize
  303. }
  304. #[cfg(test)]
  305. pub(crate) fn new(offset: Offset, account: &StoredAccountInfoWithoutData) -> Self {
  306. // the id is irrelevant in this account info. This structure is only used DURING shrink operations.
  307. // In those cases, there is only 1 append vec id per slot when we read the accounts.
  308. // Any value of storage id in account info works fine when we want the 'normal' storage.
  309. let storage_id = 0;
  310. AccountFromStorage {
  311. index_info: AccountInfo::new(
  312. StorageLocation::AppendVec(storage_id, offset),
  313. account.is_zero_lamport(),
  314. ),
  315. pubkey: *account.pubkey(),
  316. data_len: account.data_len as u64,
  317. }
  318. }
  319. }
/// Result of scanning a storage for its unique accounts.
pub struct GetUniqueAccountsResult {
    /// the de-duplicated accounts found in the storage
    pub stored_accounts: Vec<AccountFromStorage>,
    /// capacity of the storage that was scanned
    pub capacity: u64,
    /// how many duplicate-pubkey entries were encountered during the scan
    pub num_duplicated_accounts: usize,
}
/// Timings for adding a root, split by subsystem.
pub struct AccountsAddRootTiming {
    /// time spent updating the accounts index, in microseconds
    pub index_us: u64,
    /// time spent updating the accounts cache, in microseconds
    pub cache_us: u64,
}
/// Slots older the "number of slots in an epoch minus this number"
/// than max root are treated as ancient and subject to packing.
/// | older |<- slots in an epoch ->| max root
/// | older |<- offset ->|          |
/// |          ancient   |  modern  |
///
/// If this is negative, this many slots older than the number of
/// slots in epoch are still treated as modern (ie. non-ancient).
/// | older |<- abs(offset) ->|<- slots in an epoch ->| max root
/// | ancient |                modern                 |
///
/// Note that another constant DEFAULT_MAX_ANCIENT_STORAGES sets a
/// threshold for combining ancient storages so that their overall
/// number is under a certain limit, whereas this constant establishes
/// the distance from the max root slot beyond which storages holding
/// the account data for the slots are considered ancient by the
/// shrinking algorithm.
const ANCIENT_APPEND_VEC_DEFAULT_OFFSET: Option<i64> = Some(100_000);
/// The smallest size of ideal ancient storage.
/// The setting can be overridden on the command line
/// with --accounts-db-ancient-ideal-storage-size option.
const DEFAULT_ANCIENT_STORAGE_IDEAL_SIZE: u64 = 100_000;
/// Default value for the number of ancient storages the ancient slot
/// combining should converge to.
pub const DEFAULT_MAX_ANCIENT_STORAGES: usize = 100_000;
// Bail-out threshold for repeated failing iterations.
// NOTE(review): the loop this guards is not visible in this chunk — confirm usage.
#[cfg(not(test))]
const ABSURD_CONSECUTIVE_FAILED_ITERATIONS: usize = 100;
/// Policy for selecting which account storages to shrink.
#[derive(Debug, Clone, Copy)]
pub enum AccountShrinkThreshold {
    /// Measure the total space sparseness across all candidates
    /// And select the candidates by using the top sparse account storage entries to shrink.
    /// The value is the overall shrink threshold measured as ratio of the total live bytes
    /// over the total bytes.
    TotalSpace { shrink_ratio: f64 },
    /// Use the following option to shrink all stores whose alive ratio is below
    /// the specified threshold.
    IndividualStore { shrink_ratio: f64 },
}
/// Default for whether shrink optimizes total space across candidates
/// (i.e. uses the `TotalSpace` policy).
pub const DEFAULT_ACCOUNTS_SHRINK_OPTIMIZE_TOTAL_SPACE: bool = true;
/// Default alive-ratio threshold used by the shrink policies.
pub const DEFAULT_ACCOUNTS_SHRINK_RATIO: f64 = 0.80;
// default shrink policy: total-space selection at the default shrink ratio
const DEFAULT_ACCOUNTS_SHRINK_THRESHOLD_OPTION: AccountShrinkThreshold =
    AccountShrinkThreshold::TotalSpace {
        shrink_ratio: DEFAULT_ACCOUNTS_SHRINK_RATIO,
    };
  374. impl Default for AccountShrinkThreshold {
  375. fn default() -> AccountShrinkThreshold {
  376. DEFAULT_ACCOUNTS_SHRINK_THRESHOLD_OPTION
  377. }
  378. }
/// Result of scanning a slot's accounts: per-item results when the slot was
/// served from the cache, or an aggregate when served from storage.
pub enum ScanStorageResult<R, B> {
    /// one scan result per cached account
    Cached(Vec<R>),
    /// aggregate produced from the slot's storage
    Stored(B),
}
/// Summary results of generating the accounts index at startup.
#[derive(Debug)]
pub struct IndexGenerationInfo {
    /// total accounts data length observed while indexing
    pub accounts_data_len: u64,
    /// The accounts lt hash calculated during index generation.
    /// Will be used when verifying accounts, after rebuilding a Bank.
    pub calculated_accounts_lt_hash: AccountsLtHash,
}
/// Per-slot results of generating the accounts index.
#[derive(Debug, Default)]
struct SlotIndexGenerationInfo {
    /// time to insert this slot's entries into the index, in microseconds
    insert_time_us: u64,
    /// number of accounts indexed from this slot
    num_accounts: u64,
    /// sum of this slot's account data lengths
    accounts_data_len: u64,
    /// pubkeys of this slot's zero lamport accounts
    zero_lamport_pubkeys: Vec<Pubkey>,
    /// true if every account in this slot has zero lamports
    all_accounts_are_zero_lamports: bool,
    /// Number of accounts in this slot that didn't already exist in the index
    num_did_not_exist: u64,
    /// Number of accounts in this slot that already existed, and were in-mem
    num_existed_in_mem: u64,
    /// Number of accounts in this slot that already existed, and were on-disk
    num_existed_on_disk: u64,
    /// The accounts lt hash *of only this slot*
    slot_lt_hash: SlotLtHash,
    /// The number of accounts in this slot that were skipped when generating the index as they
    /// were already marked obsolete in the account storage entry
    num_obsolete_accounts_skipped: u64,
}
/// The lt hash of old/duplicate accounts
///
/// Accumulation of all the duplicate accounts found during index generation.
/// These accounts need to have their lt hashes mixed *out*.
/// This is the final value, that when applied to all the storages at startup,
/// will produce the correct accounts lt hash.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct DuplicatesLtHash(pub LtHash);
  417. impl Default for DuplicatesLtHash {
  418. fn default() -> Self {
  419. Self(LtHash::identity())
  420. }
  421. }
/// The lt hash of accounts in a single slot
#[derive(Debug)]
struct SlotLtHash(pub LtHash);
  425. impl Default for SlotLtHash {
  426. fn default() -> Self {
  427. Self(LtHash::identity())
  428. }
  429. }
/// Timings and counters gathered while generating the accounts index; reported
/// via `GenerateIndexTimings::report`.
#[derive(Default, Debug)]
struct GenerateIndexTimings {
    /// overall wall time, in microseconds
    pub total_time_us: u64,
    /// time spent in the indexing pass (see report(): lock contention makes this approximate)
    pub index_time: u64,
    /// time spent inserting entries, in microseconds
    pub insertion_time_us: u64,
    /// time spent sizing storages, in microseconds
    pub storage_size_storages_us: u64,
    /// time spent flushing the index, in microseconds
    pub index_flush_us: u64,
    /// total items visited, counting duplicates
    pub total_including_duplicates: u64,
    /// time spent visiting duplicate accounts, in microseconds
    pub visit_duplicate_accounts_time_us: u64,
    /// total (slot, key) duplicate entries seen
    pub total_duplicate_slot_keys: u64,
    /// number of unique keys that had duplicates
    pub total_num_unique_duplicate_keys: u64,
    /// number of duplicate accounts encountered
    pub num_duplicate_accounts: u64,
    /// time spent populating duplicate keys, in microseconds
    pub populate_duplicate_keys_us: u64,
    /// number of slots indexed
    pub total_slots: u64,
    /// time spent visiting zero lamport accounts, in microseconds
    pub visit_zero_lamports_us: u64,
    /// number of zero lamport single ref accounts found
    pub num_zero_lamport_single_refs: u64,
    /// number of slots where every account has zero lamports
    pub all_accounts_are_zero_lamports_slots: u64,
    /// time spent marking obsolete accounts, in microseconds
    pub mark_obsolete_accounts_us: u64,
    /// number of accounts marked obsolete
    pub num_obsolete_accounts_marked: u64,
    /// number of slots removed because they were entirely obsolete
    pub num_slots_removed_as_obsolete: u64,
    /// number of accounts skipped because they were already marked obsolete
    pub num_obsolete_accounts_skipped: u64,
}
/// Size and account count of a single storage.
#[derive(Default, Debug, PartialEq, Eq)]
struct StorageSizeAndCount {
    /// total size stored, including both alive and dead bytes
    pub stored_size: usize,
    /// number of accounts in the storage including both alive and dead accounts
    pub count: usize,
}
/// Maps a storage's file id to its size and account count.
type StorageSizeAndCountMap =
    DashMap<AccountsFileId, StorageSizeAndCount, BuildNoHashHasher<AccountsFileId>>;
impl GenerateIndexTimings {
    /// Submits all index-generation timings as the "generate_index" datapoint.
    pub fn report(&self, startup_stats: &StartupStats) {
        datapoint_info!(
            "generate_index",
            ("overall_us", self.total_time_us, i64),
            // we cannot accurately measure index insertion time because of many threads and lock contention
            ("total_us", self.index_time, i64),
            ("insertion_time_us", self.insertion_time_us, i64),
            (
                "storage_size_storages_us",
                self.storage_size_storages_us,
                i64
            ),
            ("index_flush_us", self.index_flush_us, i64),
            (
                "total_items_including_duplicates",
                self.total_including_duplicates,
                i64
            ),
            (
                "visit_duplicate_accounts_us",
                self.visit_duplicate_accounts_time_us,
                i64
            ),
            (
                "total_duplicate_slot_keys",
                self.total_duplicate_slot_keys,
                i64
            ),
            (
                "total_num_unique_duplicate_keys",
                self.total_num_unique_duplicate_keys,
                i64
            ),
            ("num_duplicate_accounts", self.num_duplicate_accounts, i64),
            (
                "populate_duplicate_keys_us",
                self.populate_duplicate_keys_us,
                i64
            ),
            ("total_slots", self.total_slots, i64),
            (
                "copy_data_us",
                // swap(0) resets the counter so a later report does not double-count
                startup_stats.copy_data_us.swap(0, Ordering::Relaxed),
                i64
            ),
            (
                "num_zero_lamport_single_refs",
                self.num_zero_lamport_single_refs,
                i64
            ),
            ("visit_zero_lamports_us", self.visit_zero_lamports_us, i64),
            (
                "all_accounts_are_zero_lamports_slots",
                self.all_accounts_are_zero_lamports_slots,
                i64
            ),
            (
                "mark_obsolete_accounts_us",
                self.mark_obsolete_accounts_us,
                i64
            ),
            (
                "num_obsolete_accounts_marked",
                self.num_obsolete_accounts_marked,
                i64
            ),
            (
                "num_slots_removed_as_obsolete",
                self.num_slots_removed_as_obsolete,
                i64
            ),
            (
                "num_obsolete_accounts_skipped",
                self.num_obsolete_accounts_skipped,
                i64
            ),
        );
    }
}
  541. impl IsZeroLamport for AccountSharedData {
  542. fn is_zero_lamport(&self) -> bool {
  543. self.lamports() == 0
  544. }
  545. }
  546. impl IsZeroLamport for Account {
  547. fn is_zero_lamport(&self) -> bool {
  548. self.lamports() == 0
  549. }
  550. }
/// An offset into the AccountsDb::storage vector
pub type AtomicAccountsFileId = AtomicU32;
pub type AccountsFileId = u32;
/// pubkey -> set of slots it appears in
type AccountSlots = HashMap<Pubkey, IntSet<Slot>>;
/// slot -> set of offsets within that slot's storage
type SlotOffsets = IntMap<Slot, IntSet<Offset>>;
/// (pubkey -> slots, slot -> offsets) gathered while reclaiming
type ReclaimResult = (AccountSlots, SlotOffsets);
/// pubkeys that were removed from the accounts index
type PubkeysRemovedFromAccountsIndex = HashSet<Pubkey>;
/// slots that are candidates for shrinking
type ShrinkCandidates = IntSet<Slot>;
// Some hints for applicability of additional sanity checks for the do_load fast-path;
// Slower fallback code path will be taken if the fast path has failed over the retry
// threshold, regardless of these hints. Also, load cannot fail not-deterministically
// even under very rare circumstances, unlike previously did allow.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LoadHint {
    /// Caller hints that it's loading transactions for a block which is
    /// descended from the current root, and at the tip of its fork.
    /// Thereby, further this assumes AccountIndex::max_root should not increase
    /// during this load, meaning there should be no squash.
    /// Overall, this enables us to assert!() strictly while running the fast-path for
    /// account loading, while maintaining the determinism of account loading and resultant
    /// transaction execution thereof.
    FixedMaxRoot,
    /// same as `FixedMaxRoot`, except do not populate the read cache on load
    FixedMaxRootDoNotPopulateReadCache,
    /// Caller can't hint the above safety assumption. Generally RPC and miscellaneous
    /// other call-site falls into this category. The likelihood of slower path is slightly
    /// increased as well.
    Unspecified,
}
/// Deferred handle to an account found via the accounts index, resolved later
/// from either a storage entry or the write cache.
#[derive(Debug)]
pub enum LoadedAccountAccessor<'a> {
    // StoredAccountInfo can't be held directly here due to its lifetime dependency on
    // AccountStorageEntry
    Stored(Option<(Arc<AccountStorageEntry>, usize)>),
    // None value in Cached variant means the cache was flushed
    Cached(Option<Cow<'a, Arc<CachedAccount>>>),
}
impl LoadedAccountAccessor<'_> {
    /// Resolves the account and returns it as an owned `AccountSharedData`.
    ///
    /// Panics if the backing cache entry or storage entry is gone; see
    /// `check_and_get_loaded_account` for why callers must rule that out first.
    fn check_and_get_loaded_account_shared_data(&mut self) -> AccountSharedData {
        // all of these following .expect() and .unwrap() are like serious logic errors,
        // ideal for representing this as rust type system....
        match self {
            LoadedAccountAccessor::Stored(Some((maybe_storage_entry, offset))) => {
                // If we do find the storage entry, we can guarantee that the storage entry is
                // safe to read from because we grabbed a reference to the storage entry while it
                // was still in the storage map. This means even if the storage entry is removed
                // from the storage map after we grabbed the storage entry, the recycler should not
                // reset the storage entry until we drop the reference to the storage entry.
                maybe_storage_entry
                    .accounts
                    .get_account_shared_data(*offset)
                    .expect(
                        "If a storage entry was found in the storage map, it must not have been \
                         reset yet",
                    )
            }
            _ => self.check_and_get_loaded_account(|loaded_account| loaded_account.take_account()),
        }
    }
    /// Runs `callback` on the loaded account, panicking if the backing entry
    /// was invalidated (callers are expected to have handled that case already).
    fn check_and_get_loaded_account<T>(
        &mut self,
        callback: impl for<'local> FnMut(LoadedAccount<'local>) -> T,
    ) -> T {
        // all of these following .expect() and .unwrap() are like serious logic errors,
        // ideal for representing this as rust type system....
        match self {
            LoadedAccountAccessor::Cached(None) | LoadedAccountAccessor::Stored(None) => {
                panic!(
                    "Should have already been taken care of when creating this \
                     LoadedAccountAccessor"
                );
            }
            LoadedAccountAccessor::Cached(Some(_cached_account)) => {
                // Cached(Some(x)) variant always produces `Some` for get_loaded_account() since
                // it just returns the inner `x` without additional fetches
                self.get_loaded_account(callback).unwrap()
            }
            LoadedAccountAccessor::Stored(Some(_maybe_storage_entry)) => {
                // If we do find the storage entry, we can guarantee that the storage entry is
                // safe to read from because we grabbed a reference to the storage entry while it
                // was still in the storage map. This means even if the storage entry is removed
                // from the storage map after we grabbed the storage entry, the recycler should not
                // reset the storage entry until we drop the reference to the storage entry.
                self.get_loaded_account(callback).expect(
                    "If a storage entry was found in the storage map, it must not have been reset \
                     yet",
                )
            }
        }
    }
    /// Runs `callback` on the loaded account, returning `None` when a stored
    /// account's storage entry can no longer provide it.
    ///
    /// NOTE: for the `Cached` variant this `take`s the inner entry, so it can
    /// only be invoked once per accessor.
    fn get_loaded_account<T>(
        &mut self,
        mut callback: impl for<'local> FnMut(LoadedAccount<'local>) -> T,
    ) -> Option<T> {
        match self {
            LoadedAccountAccessor::Cached(cached_account) => {
                let cached_account = cached_account.take().expect(
                    "Cache flushed/purged should be handled before trying to fetch account",
                );
                Some(callback(LoadedAccount::Cached(cached_account)))
            }
            LoadedAccountAccessor::Stored(maybe_storage_entry) => {
                // storage entry may not be present if slot was cleaned up in
                // between reading the accounts index and calling this function to
                // get account meta from the storage entry here
                maybe_storage_entry
                    .as_ref()
                    .and_then(|(storage_entry, offset)| {
                        storage_entry
                            .accounts
                            .get_stored_account_callback(*offset, |account| {
                                callback(LoadedAccount::Stored(account))
                            })
                    })
            }
        }
    }
}
/// An account loaded either from a storage file or from the write cache.
pub enum LoadedAccount<'a> {
    /// account read from an account storage entry
    Stored(StoredAccountInfo<'a>),
    /// account taken from the accounts write cache
    Cached(Cow<'a, Arc<CachedAccount>>),
}
  673. impl LoadedAccount<'_> {
  674. pub fn pubkey(&self) -> &Pubkey {
  675. match self {
  676. LoadedAccount::Stored(stored_account) => stored_account.pubkey(),
  677. LoadedAccount::Cached(cached_account) => cached_account.pubkey(),
  678. }
  679. }
  680. pub fn take_account(&self) -> AccountSharedData {
  681. match self {
  682. LoadedAccount::Stored(stored_account) => create_account_shared_data(stored_account),
  683. LoadedAccount::Cached(cached_account) => match cached_account {
  684. Cow::Owned(cached_account) => cached_account.account.clone(),
  685. Cow::Borrowed(cached_account) => cached_account.account.clone(),
  686. },
  687. }
  688. }
  689. pub fn is_cached(&self) -> bool {
  690. match self {
  691. LoadedAccount::Stored(_) => false,
  692. LoadedAccount::Cached(_) => true,
  693. }
  694. }
  695. /// data_len can be calculated without having access to `&data` in future implementations
  696. pub fn data_len(&self) -> usize {
  697. self.data().len()
  698. }
  699. }
/// Delegates every accessor to the underlying stored or cached account.
impl ReadableAccount for LoadedAccount<'_> {
    fn lamports(&self) -> u64 {
        match self {
            LoadedAccount::Stored(stored_account) => stored_account.lamports(),
            LoadedAccount::Cached(cached_account) => cached_account.account.lamports(),
        }
    }
    fn data(&self) -> &[u8] {
        match self {
            LoadedAccount::Stored(stored_account) => stored_account.data(),
            LoadedAccount::Cached(cached_account) => cached_account.account.data(),
        }
    }
    fn owner(&self) -> &Pubkey {
        match self {
            LoadedAccount::Stored(stored_account) => stored_account.owner(),
            LoadedAccount::Cached(cached_account) => cached_account.account.owner(),
        }
    }
    fn executable(&self) -> bool {
        match self {
            LoadedAccount::Stored(stored_account) => stored_account.executable(),
            LoadedAccount::Cached(cached_account) => cached_account.account.executable(),
        }
    }
    fn rent_epoch(&self) -> Epoch {
        match self {
            LoadedAccount::Stored(stored_account) => stored_account.rent_epoch(),
            LoadedAccount::Cached(cached_account) => cached_account.account.rent_epoch(),
        }
    }
    fn to_account_shared_data(&self) -> AccountSharedData {
        // reuse the owned-conversion logic above
        self.take_account()
    }
}
/// Timings and counters for collecting the candidate keys for clean.
#[derive(Default)]
struct CleanKeyTimings {
    /// time to collect delta keys, in microseconds
    collect_delta_keys_us: u64,
    /// time to insert delta keys, in microseconds
    delta_insert_us: u64,
    /// time to process dirty stores, in microseconds
    dirty_store_processing_us: u64,
    /// number of delta keys collected
    delta_key_count: u64,
    /// number of pubkeys gathered from dirty stores
    dirty_pubkeys_count: u64,
    /// oldest slot among the dirty stores processed
    oldest_dirty_slot: Slot,
    /// number of ancient append vecs that were scanned because they were dirty when clean started
    dirty_ancient_stores: usize,
}
/// Persistent storage structure holding the accounts
#[derive(Debug)]
pub struct AccountStorageEntry {
    /// unique file id; combined with `slot` it determines the backing file name
    pub(crate) id: AccountsFileId,
    /// the slot whose accounts this storage holds
    pub(crate) slot: Slot,
    /// storage holding the accounts
    pub accounts: AccountsFile,
    /// The number of alive accounts in this storage
    count: AtomicUsize,
    /// number of alive bytes in this storage (byte counterpart of `count`)
    alive_bytes: AtomicUsize,
    /// offsets to accounts that are zero lamport single ref stored in this
    /// storage. These are still alive. But, shrink will be able to remove them.
    ///
    /// NOTE: It's possible that one of these zero lamport single ref accounts
    /// could be written in a new transaction (and later rooted & flushed) and a
    /// later clean runs and marks this account dead before this storage gets a
    /// chance to be shrunk, thus making the account dead in both "alive_bytes"
    /// and as a zero lamport single ref. If this happens, we will count this
    /// account as "dead" twice. However, this should be fine. It just makes
    /// shrink more likely to visit this storage.
    zero_lamport_single_ref_offsets: RwLock<IntSet<Offset>>,
    /// Obsolete Accounts. These are accounts that are still present in the storage
    /// but should be ignored during rebuild. They have been removed
    /// from the accounts index, so they will not be picked up by scan.
    /// Slot is the slot at which the account is no longer needed.
    /// Two scenarios cause an account entry to be marked obsolete
    /// 1. The account was rewritten to a newer slot
    /// 2. The account was set to zero lamports and is older than the last
    ///    full snapshot. In this case, slot is set to the snapshot slot
    obsolete_accounts: RwLock<ObsoleteAccounts>,
}
  777. impl AccountStorageEntry {
  778. pub fn new(
  779. path: &Path,
  780. slot: Slot,
  781. id: AccountsFileId,
  782. file_size: u64,
  783. provider: AccountsFileProvider,
  784. storage_access: StorageAccess,
  785. ) -> Self {
  786. let tail = AccountsFile::file_name(slot, id);
  787. let path = Path::new(path).join(tail);
  788. let accounts = provider.new_writable(path, file_size, storage_access);
  789. Self {
  790. id,
  791. slot,
  792. accounts,
  793. count: AtomicUsize::new(0),
  794. alive_bytes: AtomicUsize::new(0),
  795. zero_lamport_single_ref_offsets: RwLock::default(),
  796. obsolete_accounts: RwLock::default(),
  797. }
  798. }
  799. /// open a new instance of the storage that is readonly
  800. #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))]
  801. fn reopen_as_readonly(&self, storage_access: StorageAccess) -> Option<Self> {
  802. if storage_access != StorageAccess::File {
  803. // if we are only using mmap, then no reason to re-open
  804. return None;
  805. }
  806. self.accounts.reopen_as_readonly().map(|accounts| Self {
  807. id: self.id,
  808. slot: self.slot,
  809. count: AtomicUsize::new(self.count()),
  810. alive_bytes: AtomicUsize::new(self.alive_bytes()),
  811. accounts,
  812. zero_lamport_single_ref_offsets: RwLock::new(
  813. self.zero_lamport_single_ref_offsets.read().unwrap().clone(),
  814. ),
  815. obsolete_accounts: RwLock::new(self.obsolete_accounts.read().unwrap().clone()),
  816. })
  817. }
/// Wraps an already-existing accounts file in a storage entry,
/// carrying over its previously recorded obsolete accounts.
pub fn new_existing(
    slot: Slot,
    id: AccountsFileId,
    accounts: AccountsFile,
    obsolete_accounts: ObsoleteAccounts,
) -> Self {
    Self {
        id,
        slot,
        accounts,
        // alive count/bytes start at zero; presumably populated later
        // (e.g. during index generation) — confirm with callers
        count: AtomicUsize::new(0),
        alive_bytes: AtomicUsize::new(0),
        zero_lamport_single_ref_offsets: RwLock::default(),
        obsolete_accounts: RwLock::new(obsolete_accounts),
    }
}
/// Returns the number of alive accounts in this storage
pub fn count(&self) -> usize {
    self.count.load(Ordering::Acquire)
}
/// Returns the value of the `alive_bytes` counter for this storage
pub fn alive_bytes(&self) -> usize {
    self.alive_bytes.load(Ordering::Acquire)
}
/// Returns the accounts that were marked obsolete as of the passed in slot
/// or earlier. Returned data includes the slots that the accounts were marked
/// obsolete at
pub fn obsolete_accounts_for_snapshots(&self, slot: Slot) -> ObsoleteAccounts {
    self.obsolete_accounts_read_lock()
        .obsolete_accounts_for_snapshots(slot)
}
/// Locks obsolete accounts with a read lock and returns the accounts with the guard
pub(crate) fn obsolete_accounts_read_lock(&self) -> RwLockReadGuard<'_, ObsoleteAccounts> {
    self.obsolete_accounts.read().unwrap()
}
  852. /// Returns the number of bytes that were marked obsolete as of the passed
  853. /// in slot or earlier. If slot is None, then slot will be assumed to be the
  854. /// max root, and all obsolete bytes will be returned.
  855. pub fn get_obsolete_bytes(&self, slot: Option<Slot>) -> usize {
  856. let obsolete_bytes: usize = self
  857. .obsolete_accounts_read_lock()
  858. .filter_obsolete_accounts(slot)
  859. .map(|(offset, data_len)| {
  860. self.accounts
  861. .calculate_stored_size(data_len)
  862. .min(self.accounts.len() - offset)
  863. })
  864. .sum();
  865. obsolete_bytes
  866. }
  867. /// Return true if offset is "new" and inserted successfully. Otherwise,
  868. /// return false if the offset exists already.
  869. fn insert_zero_lamport_single_ref_account_offset(&self, offset: usize) -> bool {
  870. let mut zero_lamport_single_ref_offsets =
  871. self.zero_lamport_single_ref_offsets.write().unwrap();
  872. zero_lamport_single_ref_offsets.insert(offset)
  873. }
  874. /// Insert offsets into the zero lamport single ref account offset set.
  875. /// Return the number of new offsets that were inserted.
  876. fn batch_insert_zero_lamport_single_ref_account_offsets(&self, offsets: &[Offset]) -> u64 {
  877. let mut zero_lamport_single_ref_offsets =
  878. self.zero_lamport_single_ref_offsets.write().unwrap();
  879. let mut count = 0;
  880. for offset in offsets {
  881. if zero_lamport_single_ref_offsets.insert(*offset) {
  882. count += 1;
  883. }
  884. }
  885. count
  886. }
    /// Return the number of zero_lamport_single_ref accounts in the storage.
    fn num_zero_lamport_single_ref_accounts(&self) -> usize {
        self.zero_lamport_single_ref_offsets.read().unwrap().len()
    }

    /// Return the "alive_bytes" minus "zero_lamport_single_ref_accounts bytes".
    fn alive_bytes_exclude_zero_lamport_single_ref_accounts(&self) -> usize {
        // The accounts file converts the tracked offset count into the number of
        // bytes those zero-lamport single-ref accounts are considered to occupy.
        let zero_lamport_dead_bytes = self
            .accounts
            .dead_bytes_due_to_zero_lamport_single_ref(self.num_zero_lamport_single_ref_accounts());
        // saturating_sub guards against the two counters transiently disagreeing
        self.alive_bytes().saturating_sub(zero_lamport_dead_bytes)
    }

    /// Returns the number of bytes used in this storage
    pub fn written_bytes(&self) -> u64 {
        self.accounts.len() as u64
    }

    /// Returns the number of bytes, not accounts, this storage can hold
    pub fn capacity(&self) -> u64 {
        self.accounts.capacity()
    }

    /// Returns true if this storage has any alive accounts (count() > 0)
    pub fn has_accounts(&self) -> bool {
        self.count() > 0
    }

    /// Returns the slot this storage belongs to
    pub fn slot(&self) -> Slot {
        self.slot
    }

    /// Returns this storage's file id
    pub fn id(&self) -> AccountsFileId {
        self.id
    }

    /// Delegates flush to the underlying accounts file
    pub fn flush(&self) -> Result<(), AccountsFileError> {
        self.accounts.flush()
    }

    /// Adds `num_accounts` alive accounts totaling `num_bytes` to this
    /// storage's counters. The two counters are updated independently.
    fn add_accounts(&self, num_accounts: usize, num_bytes: usize) {
        self.count.fetch_add(num_accounts, Ordering::Release);
        self.alive_bytes.fetch_add(num_bytes, Ordering::Release);
    }
    /// Removes `num_bytes` and `num_accounts` from the storage,
    /// and returns the remaining number of accounts.
    ///
    /// NOTE: the two counters are decremented independently, so a concurrent
    /// reader may briefly observe one updated and not the other.
    fn remove_accounts(&self, num_bytes: usize, num_accounts: usize) -> usize {
        // fetch_sub returns the values *before* subtraction; the invariant below
        // is checked against those pre-subtraction values.
        let prev_alive_bytes = self.alive_bytes.fetch_sub(num_bytes, Ordering::Release);
        let prev_count = self.count.fetch_sub(num_accounts, Ordering::Release);
        // enforce invariant that we're not removing too many bytes or accounts
        assert!(
            num_bytes <= prev_alive_bytes && num_accounts <= prev_count,
            "Too many bytes or accounts removed from storage! slot: {}, id: {}, initial alive \
             bytes: {prev_alive_bytes}, initial num accounts: {prev_count}, num bytes removed: \
             {num_bytes}, num accounts removed: {num_accounts}",
            self.slot,
            self.id,
        );
        // SAFETY: subtraction is safe since we just asserted num_accounts <= prev_count
        prev_count - num_accounts
    }

    /// Returns the path to the underlying accounts storage file
    pub fn path(&self) -> &Path {
        self.accounts.path()
    }
  943. }
  944. pub fn get_temp_accounts_paths(count: u32) -> io::Result<(Vec<TempDir>, Vec<PathBuf>)> {
  945. let temp_dirs: io::Result<Vec<TempDir>> = (0..count).map(|_| TempDir::new()).collect();
  946. let temp_dirs = temp_dirs?;
  947. let paths: io::Result<Vec<_>> = temp_dirs
  948. .iter()
  949. .map(|temp_dir| {
  950. utils::create_accounts_run_and_snapshot_dirs(temp_dir)
  951. .map(|(run_dir, _snapshot_dir)| run_dir)
  952. })
  953. .collect();
  954. let paths = paths?;
  955. Ok((temp_dirs, paths))
  956. }
#[derive(Default, Debug)]
struct CleaningInfo {
    // Index entries (slot, AccountInfo) collected for this pubkey.
    slot_list: SlotList<AccountInfo>,
    // Index ref count for this pubkey; compared against slot_list.len() in
    // calc_delete_dependencies() to decide whether every store holding this
    // account is being deleted.
    ref_count: RefCount,
    /// Indicates if this account might have a zero lamport index entry.
    /// If false, the account *shall* not have zero lamport index entries.
    /// If true, the account *might* have zero lamport index entries.
    might_contain_zero_lamport_entry: bool,
}

/// Indicates when to mark accounts obsolete
/// * Disabled - do not mark accounts obsolete
/// * Enabled - mark accounts obsolete during write cache flush
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
pub enum MarkObsoleteAccounts {
    #[default]
    Disabled,
    Enabled,
}

/// This is the return type of AccountsDb::construct_candidate_clean_keys.
/// It's a collection of pubkeys with associated information to
/// facilitate the decision making about which accounts can be removed
/// from the accounts index. In addition, the minimal dirty slot is
/// included in the returned value.
/// The boxed slice is binned: each element is one lock-guarded bin of candidates.
type CleaningCandidates = (Box<[RwLock<HashMap<Pubkey, CleaningInfo>>]>, Option<Slot>);

/// Removing unrooted slots in Accounts Background Service needs to be synchronized with flushing
/// slots from the Accounts Cache. This keeps track of those slots and the Mutex + Condvar for
/// synchronization.
#[derive(Debug, Default)]
struct RemoveUnrootedSlotsSynchronization {
    // slots being flushed from the cache or being purged
    slots_under_contention: Mutex<IntSet<Slot>>,
    signal: Condvar,
}

// Index type used throughout AccountsDb. NOTE(review): both generic parameters
// are AccountInfo here; confirm the second parameter's role against
// AccountsIndex's definition.
type AccountInfoAccountsIndex = AccountsIndex<AccountInfo, AccountInfo>;
// This structure handles the load/store of the accounts
#[derive(Debug)]
pub struct AccountsDb {
    /// Keeps track of the index into AppendVec on a per slot basis
    pub accounts_index: AccountInfoAccountsIndex,
    /// Some(offset) iff we want to squash old append vecs together into 'ancient append vecs'
    /// Some(offset) means for slots up to (max_slot - (slots_per_epoch - 'offset')), put them in ancient append vecs
    pub ancient_append_vec_offset: Option<i64>,
    pub ancient_storage_ideal_size: u64,
    pub max_ancient_storages: usize,
    /// true iff we want to skip the initial hash calculation on startup
    pub skip_initial_hash_calc: bool,
    pub storage: AccountStorage,
    pub accounts_cache: AccountsCache,
    write_cache_limit_bytes: Option<u64>,
    read_only_accounts_cache: ReadOnlyAccountsCache,
    /// distribute the accounts across storage lists
    pub next_id: AtomicAccountsFileId,
    /// Set of shrinkable stores organized by map of slot to storage id
    pub shrink_candidate_slots: Mutex<ShrinkCandidates>,
    pub write_version: AtomicU64,
    /// Set of storage paths to pick from
    pub paths: Vec<PathBuf>,
    /// Base directory for various necessary files
    base_working_path: PathBuf,
    // used by tests - held until we are dropped
    #[allow(dead_code)]
    base_working_temp_dir: Option<TempDir>,
    // paths used by shrink; falls back to `paths` when not configured
    shrink_paths: Vec<PathBuf>,
    /// Directory of paths this accounts_db needs to hold/remove
    #[allow(dead_code)]
    pub temp_paths: Option<Vec<TempDir>>,
    /// Starting file size of appendvecs
    file_size: u64,
    /// Thread pool for foreground tasks, e.g. transaction processing
    pub thread_pool_foreground: ThreadPool,
    /// Thread pool for background tasks, e.g. AccountsBackgroundService and flush/clean/shrink
    pub thread_pool_background: ThreadPool,
    pub stats: AccountsStats,
    clean_accounts_stats: CleanAccountsStats,
    // Stats for purges called outside of clean_accounts()
    external_purge_slots_stats: PurgeStats,
    pub shrink_stats: ShrinkStats,
    pub(crate) shrink_ancient_stats: ShrinkAncientStats,
    pub account_indexes: AccountSecondaryIndexes,
    /// Set of unique keys per slot which is used
    /// to drive clean_accounts
    /// Populated when flushing the accounts write cache
    uncleaned_pubkeys: DashMap<Slot, Vec<Pubkey>, BuildNoHashHasher<Slot>>,
    #[cfg(test)]
    load_delay: u64,
    #[cfg(test)]
    load_limit: AtomicU64,
    /// true if drop_callback is attached to the bank.
    is_bank_drop_callback_enabled: AtomicBool,
    /// Set of slots currently being flushed by `flush_slot_cache()` or removed
    /// by `remove_unrooted_slot()`. Used to ensure `remove_unrooted_slots(slots)`
    /// can safely clear the set of unrooted slots `slots`.
    remove_unrooted_slots_synchronization: RemoveUnrootedSlotsSynchronization,
    shrink_ratio: AccountShrinkThreshold,
    /// Set of stores which are recently rooted or had accounts removed
    /// such that potentially a 0-lamport account update could be present which
    /// means we can remove the account from the index entirely.
    dirty_stores: DashMap<Slot, Arc<AccountStorageEntry>, BuildNoHashHasher<Slot>>,
    /// Zero-lamport accounts that are *not* purged during clean because they need to stay alive
    /// for incremental snapshot support.
    zero_lamport_accounts_to_purge_after_full_snapshot: DashSet<(Slot, Pubkey)>,
    /// GeyserPlugin accounts update notifier
    accounts_update_notifier: Option<AccountsUpdateNotifier>,
    pub(crate) active_stats: ActiveStats,
    /// Used to disable logging dead slots during removal.
    /// allow disabling noisy log
    pub log_dead_slots: AtomicBool,
    /// debug feature to scan every append vec and verify refcounts are equal
    exhaustively_verify_refcounts: bool,
    /// storage format to use for new storages
    accounts_file_provider: AccountsFileProvider,
    /// method to use for accessing storages
    storage_access: StorageAccess,
    /// index scan filtering for shrinking
    scan_filter_for_shrinking: ScanFilter,
    /// this will live here until the feature for partitioned epoch rewards is activated.
    /// At that point, this and other code can be deleted.
    pub partitioned_epoch_rewards_config: PartitionedEpochRewardsConfig,
    /// The latest full snapshot slot dictates how to handle zero lamport accounts
    /// Note, this is None if we're told to *not* take snapshots
    latest_full_snapshot_slot: SeqLock<Option<Slot>>,
    /// These are the ancient storages that could be valuable to
    /// shrink, sorted by amount of dead bytes. The elements
    /// are sorted from the largest dead bytes to the smallest.
    /// Members are Slot and capacity. If capacity is smaller, then
    /// that means the storage was already shrunk.
    pub(crate) best_ancient_slots_to_shrink: RwLock<VecDeque<(Slot, u64)>>,
    /// Flag to indicate if the experimental obsolete account tracking feature is enabled.
    /// This feature tracks obsolete accounts in the account storage entry allowing
    /// for earlier cleaning of obsolete accounts in the storages and index.
    pub mark_obsolete_accounts: MarkObsoleteAccounts,
}
  1089. pub fn quarter_thread_count() -> usize {
  1090. std::cmp::max(2, num_cpus::get() / 4)
  1091. }
/// Default thread count for the foreground thread pool; delegates to the
/// imported `get_thread_count()` helper.
pub fn default_num_foreground_threads() -> usize {
    get_thread_count()
}
#[cfg(feature = "frozen-abi")]
impl solana_frozen_abi::abi_example::AbiExample for AccountsDb {
    /// Build a minimal populated AccountsDb for frozen-abi purposes:
    /// one 1-lamport account under the default pubkey stored at slot 0,
    /// then rooted and flushed from the write cache.
    fn example() -> Self {
        let accounts_db = AccountsDb::new_single_for_tests();
        let key = Pubkey::default();
        let some_data_len = 5;
        let some_slot: Slot = 0;
        let account = AccountSharedData::new(1, some_data_len, &key);
        accounts_db.store_for_tests((some_slot, [(&key, &account)].as_slice()));
        accounts_db.add_root_and_flush_write_cache(0);
        accounts_db
    }
}
  1108. impl AccountsDb {
    // The default high and low watermark sizes for the accounts read cache.
    // If the cache size exceeds MAX_SIZE_HI, it'll evict entries until the size is <= MAX_SIZE_LO.
    //
    // These default values were chosen empirically to minimize evictions on mainnet-beta.
    // As of 2025-08-15 on mainnet-beta, the read cache size's steady state is around 2.5 GB,
    // and add a bit more to buffer future growth.
    // Low watermark: ~3.0 GB
    #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))]
    const DEFAULT_MAX_READ_ONLY_CACHE_DATA_SIZE_LO: usize = 3_000_000_000;
    // High watermark: ~3.1 GB
    #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))]
    const DEFAULT_MAX_READ_ONLY_CACHE_DATA_SIZE_HI: usize = 3_100_000_000;
    // See AccountsDbConfig::read_cache_evict_sample_size.
    #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))]
    const DEFAULT_READ_ONLY_CACHE_EVICT_SAMPLE_SIZE: usize = 8;
    /// Create a new AccountsDb from `accounts_db_config`.
    ///
    /// If `paths` is empty, temporary account directories are created (primarily
    /// for tests) and kept alive via `temp_paths`. Also builds the accounts
    /// index, the foreground/background rayon thread pools, and the read-only
    /// accounts cache, then creates every accounts directory on disk.
    ///
    /// Panics on filesystem errors (temp dir / directory creation) and if the
    /// rayon thread pools cannot be built.
    pub fn new_with_config(
        paths: Vec<PathBuf>,
        accounts_db_config: AccountsDbConfig,
        accounts_update_notifier: Option<AccountsUpdateNotifier>,
        exit: Arc<AtomicBool>,
    ) -> Self {
        let accounts_index_config = accounts_db_config.index.unwrap_or_default();
        let accounts_index = AccountsIndex::new(&accounts_index_config, exit);
        let base_working_path = accounts_db_config.base_working_path.clone();
        // Use the configured base working path, or fall back to a TempDir that
        // is owned by the AccountsDb (so it lives as long as we do).
        let (base_working_path, base_working_temp_dir) =
            if let Some(base_working_path) = base_working_path {
                (base_working_path, None)
            } else {
                let base_working_temp_dir = TempDir::new().unwrap();
                let base_working_path = base_working_temp_dir.path().to_path_buf();
                (base_working_path, Some(base_working_temp_dir))
            };
        let (paths, temp_paths) = if paths.is_empty() {
            // Create a temporary set of accounts directories, used primarily
            // for testing
            let (temp_dirs, temp_paths) = get_temp_accounts_paths(DEFAULT_NUM_DIRS).unwrap();
            (temp_paths, Some(temp_dirs))
        } else {
            (paths, None)
        };
        // Shrink writes into its own configured paths, defaulting to `paths`.
        let shrink_paths = accounts_db_config
            .shrink_paths
            .clone()
            .unwrap_or_else(|| paths.clone());
        let read_cache_size = accounts_db_config.read_cache_limit_bytes.unwrap_or((
            Self::DEFAULT_MAX_READ_ONLY_CACHE_DATA_SIZE_LO,
            Self::DEFAULT_MAX_READ_ONLY_CACHE_DATA_SIZE_HI,
        ));
        let read_cache_evict_sample_size = accounts_db_config
            .read_cache_evict_sample_size
            .unwrap_or(Self::DEFAULT_READ_ONLY_CACHE_EVICT_SAMPLE_SIZE);
        // Increase the stack for foreground threads
        // rayon needs a lot of stack
        const ACCOUNTS_STACK_SIZE: usize = 8 * 1024 * 1024;
        let num_foreground_threads = accounts_db_config
            .num_foreground_threads
            .map(Into::into)
            .unwrap_or_else(default_num_foreground_threads);
        let thread_pool_foreground = rayon::ThreadPoolBuilder::new()
            .num_threads(num_foreground_threads)
            .thread_name(|i| format!("solAcctsDbFg{i:02}"))
            .stack_size(ACCOUNTS_STACK_SIZE)
            .build()
            .expect("new rayon threadpool");
        let num_background_threads = accounts_db_config
            .num_background_threads
            .map(Into::into)
            .unwrap_or_else(quarter_thread_count);
        let thread_pool_background = rayon::ThreadPoolBuilder::new()
            .thread_name(|i| format!("solAcctsDbBg{i:02}"))
            .num_threads(num_background_threads)
            .build()
            .expect("new rayon threadpool");
        let new = Self {
            accounts_index,
            paths,
            base_working_path,
            base_working_temp_dir,
            temp_paths,
            shrink_paths,
            skip_initial_hash_calc: accounts_db_config.skip_initial_hash_calc,
            ancient_append_vec_offset: accounts_db_config
                .ancient_append_vec_offset
                .or(ANCIENT_APPEND_VEC_DEFAULT_OFFSET),
            ancient_storage_ideal_size: accounts_db_config
                .ancient_storage_ideal_size
                .unwrap_or(DEFAULT_ANCIENT_STORAGE_IDEAL_SIZE),
            max_ancient_storages: accounts_db_config
                .max_ancient_storages
                .unwrap_or(DEFAULT_MAX_ANCIENT_STORAGES),
            account_indexes: accounts_db_config.account_indexes.unwrap_or_default(),
            shrink_ratio: accounts_db_config.shrink_ratio,
            accounts_update_notifier,
            read_only_accounts_cache: ReadOnlyAccountsCache::new(
                read_cache_size.0,
                read_cache_size.1,
                read_cache_evict_sample_size,
            ),
            write_cache_limit_bytes: accounts_db_config.write_cache_limit_bytes,
            partitioned_epoch_rewards_config: accounts_db_config.partitioned_epoch_rewards_config,
            exhaustively_verify_refcounts: accounts_db_config.exhaustively_verify_refcounts,
            storage_access: accounts_db_config.storage_access,
            scan_filter_for_shrinking: accounts_db_config.scan_filter_for_shrinking,
            thread_pool_foreground,
            thread_pool_background,
            active_stats: ActiveStats::default(),
            storage: AccountStorage::default(),
            accounts_cache: AccountsCache::default(),
            uncleaned_pubkeys: DashMap::default(),
            next_id: AtomicAccountsFileId::new(0),
            shrink_candidate_slots: Mutex::new(ShrinkCandidates::default()),
            write_version: AtomicU64::new(0),
            file_size: DEFAULT_FILE_SIZE,
            external_purge_slots_stats: PurgeStats::default(),
            clean_accounts_stats: CleanAccountsStats::default(),
            shrink_stats: ShrinkStats::default(),
            shrink_ancient_stats: ShrinkAncientStats::default(),
            stats: AccountsStats::default(),
            #[cfg(test)]
            load_delay: u64::default(),
            #[cfg(test)]
            load_limit: AtomicU64::default(),
            is_bank_drop_callback_enabled: AtomicBool::default(),
            remove_unrooted_slots_synchronization: RemoveUnrootedSlotsSynchronization::default(),
            dirty_stores: DashMap::default(),
            zero_lamport_accounts_to_purge_after_full_snapshot: DashSet::default(),
            log_dead_slots: AtomicBool::new(true),
            accounts_file_provider: AccountsFileProvider::default(),
            latest_full_snapshot_slot: SeqLock::new(None),
            best_ancient_slots_to_shrink: RwLock::default(),
            mark_obsolete_accounts: accounts_db_config.mark_obsolete_accounts,
        };
        {
            // Ensure every accounts directory exists on disk.
            for path in new.paths.iter() {
                std::fs::create_dir_all(path).expect("Create directory failed.");
            }
        }
        new
    }
    /// Returns the starting file size used for new appendvecs
    pub fn file_size(&self) -> u64 {
        self.file_size
    }

    /// Get the base working directory
    pub fn get_base_working_path(&self) -> PathBuf {
        self.base_working_path.clone()
    }

    /// Returns true if there is an accounts update notifier.
    pub fn has_accounts_update_notifier(&self) -> bool {
        self.accounts_update_notifier.is_some()
    }

    /// Allocates the next unique storage file id.
    ///
    /// Panics once the id counter reaches `AccountsFileId::MAX`.
    fn next_id(&self) -> AccountsFileId {
        let next_id = self.next_id.fetch_add(1, Ordering::AcqRel);
        assert!(
            next_id != AccountsFileId::MAX,
            "We've run out of storage ids!"
        );
        next_id
    }

    /// Creates a new storage entry at `path` for `slot` with capacity `size`,
    /// using a freshly allocated id and this db's file provider / access mode.
    fn new_storage_entry(&self, slot: Slot, path: &Path, size: u64) -> AccountStorageEntry {
        AccountStorageEntry::new(
            path,
            slot,
            self.next_id(),
            size,
            self.accounts_file_provider,
            self.storage_access,
        )
    }
    /// While scanning cleaning candidates obtain slots that can be
    /// reclaimed for each pubkey. In addition, if the pubkey is
    /// removed from the index, insert in pubkeys_removed_from_accounts_index.
    fn collect_reclaims(
        &self,
        pubkey: &Pubkey,
        max_clean_root_inclusive: Option<Slot>,
        ancient_account_cleans: &AtomicU64,
        epoch_schedule: &EpochSchedule,
        pubkeys_removed_from_accounts_index: &Mutex<PubkeysRemovedFromAccountsIndex>,
    ) -> ReclaimsSlotList<AccountInfo> {
        let one_epoch_old = self.get_oldest_non_ancient_slot(epoch_schedule);
        let mut clean_rooted = Measure::start("clean_old_root-ms");
        let mut reclaims = ReclaimsSlotList::new();
        // Collect reclaimable rooted entries for this pubkey (bounded by
        // max_clean_root_inclusive); returns whether the pubkey's index entry
        // became empty and was removed entirely.
        let removed_from_index = self.accounts_index.clean_rooted_entries(
            pubkey,
            &mut reclaims,
            max_clean_root_inclusive,
        );
        if removed_from_index {
            pubkeys_removed_from_accounts_index
                .lock()
                .unwrap()
                .insert(*pubkey);
        }
        if !reclaims.is_empty() {
            // figure out how many ancient accounts have been reclaimed
            let old_reclaims = reclaims
                .iter()
                .filter_map(|(slot, _)| (slot < &one_epoch_old).then_some(1))
                .sum();
            ancient_account_cleans.fetch_add(old_reclaims, Ordering::Relaxed);
        }
        clean_rooted.stop();
        self.clean_accounts_stats
            .clean_old_root_us
            .fetch_add(clean_rooted.as_us(), Ordering::Relaxed);
        reclaims
    }
    /// Reclaim older states of accounts older than max_clean_root_inclusive for AccountsDb bloat mitigation.
    /// Any accounts which are removed from the accounts index are returned in PubkeysRemovedFromAccountsIndex.
    /// These should NOT be unref'd later from the accounts index.
    fn clean_accounts_older_than_root(
        &self,
        reclaims: &SlotList<AccountInfo>,
        pubkeys_removed_from_accounts_index: &HashSet<Pubkey>,
    ) -> ReclaimResult {
        // Nothing to reclaim: skip the measured handle_reclaims call entirely.
        if reclaims.is_empty() {
            return ReclaimResult::default();
        }
        let (reclaim_result, reclaim_us) = measure_us!(self.handle_reclaims(
            reclaims.iter(),
            None,
            pubkeys_removed_from_accounts_index,
            // Process any dead slots discovered while reclaiming, attributing
            // their purge work to clean's purge stats.
            HandleReclaims::ProcessDeadSlots(&self.clean_accounts_stats.purge_stats),
            MarkAccountsObsolete::No,
        ));
        self.clean_accounts_stats
            .clean_old_root_reclaim_us
            .fetch_add(reclaim_us, Ordering::Relaxed);
        reclaim_result
    }
    /// increment store_counts to non-zero for all stores that can not be deleted.
    /// a store cannot be deleted if:
    /// 1. one of the pubkeys in the store has account info to a store whose store count is not going to zero
    /// 2. a pubkey we were planning to remove is not removing all stores that contain the account
    fn calc_delete_dependencies(
        &self,
        candidates: &[HashMap<Pubkey, CleaningInfo>],
        store_counts: &mut HashMap<Slot, (usize, HashSet<Pubkey>)>,
        min_slot: Option<Slot>,
    ) {
        // Another pass to check if there are some filtered accounts which
        // do not match the criteria of deleting all appendvecs which contain them
        // then increment their storage count.
        let mut already_counted = IntSet::default();
        for (bin_index, bin) in candidates.iter().enumerate() {
            for (pubkey, cleaning_info) in bin.iter() {
                let slot_list = &cleaning_info.slot_list;
                let ref_count = &cleaning_info.ref_count;
                let mut failed_slot = None;
                // If the slot list accounts for every index reference, then every
                // store holding this account is a deletion candidate.
                let all_stores_being_deleted = slot_list.len() as RefCount == *ref_count;
                if all_stores_being_deleted {
                    let mut delete = true;
                    for (slot, _account_info) in slot_list {
                        if let Some(count) = store_counts.get(slot).map(|s| s.0) {
                            debug!("calc_delete_dependencies() slot: {slot}, count len: {count}");
                            if count == 0 {
                                // this store CAN be removed
                                continue;
                            }
                        }
                        // One of the pubkeys in the store has account info to a store whose store count is not going to zero.
                        // If the store cannot be found, that also means store isn't being deleted.
                        failed_slot = Some(*slot);
                        delete = false;
                        break;
                    }
                    if delete {
                        // this pubkey can be deleted from all stores it is in
                        continue;
                    }
                } else {
                    // a pubkey we were planning to remove is not removing all stores that contain the account
                    debug!(
                        "calc_delete_dependencies(), pubkey: {pubkey}, slot list len: {}, ref \
                         count: {ref_count}, slot list: {slot_list:?}",
                        slot_list.len(),
                    );
                }
                // increment store_counts to non-zero for all stores that can not be deleted.
                let mut pending_stores = IntSet::default();
                for (slot, _account_info) in slot_list {
                    if !already_counted.contains(slot) {
                        pending_stores.insert(*slot);
                    }
                }
                // Transitively propagate "cannot delete": any store sharing a
                // pubkey with an undeletable store also becomes undeletable.
                while !pending_stores.is_empty() {
                    let slot = pending_stores.iter().next().cloned().unwrap();
                    // Log (once per pubkey; failed_slot is take()n) when the
                    // oldest slot is pinned alive by this pubkey.
                    if Some(slot) == min_slot {
                        if let Some(failed_slot) = failed_slot.take() {
                            info!(
                                "calc_delete_dependencies, oldest slot is not able to be deleted \
                                 because of {pubkey} in slot {failed_slot}"
                            );
                        } else {
                            info!(
                                "calc_delete_dependencies, oldest slot is not able to be deleted \
                                 because of {pubkey}, slot list len: {}, ref count: {ref_count}",
                                slot_list.len()
                            );
                        }
                    }
                    pending_stores.remove(&slot);
                    if !already_counted.insert(slot) {
                        continue;
                    }
                    // the point of all this code: remove the store count for all stores we cannot remove
                    if let Some(store_count) = store_counts.remove(&slot) {
                        // all pubkeys in this store also cannot be removed from all stores they are in
                        let affected_pubkeys = &store_count.1;
                        for key in affected_pubkeys {
                            let candidates_bin_index =
                                self.accounts_index.bin_calculator.bin_from_pubkey(key);
                            let mut update_pending_stores =
                                |bin: &HashMap<Pubkey, CleaningInfo>| {
                                    for (slot, _account_info) in &bin.get(key).unwrap().slot_list {
                                        if !already_counted.contains(slot) {
                                            pending_stores.insert(*slot);
                                        }
                                    }
                                };
                            // Reuse the bin we are already iterating when possible.
                            if candidates_bin_index == bin_index {
                                update_pending_stores(bin);
                            } else {
                                update_pending_stores(&candidates[candidates_bin_index]);
                            }
                        }
                    }
                }
            }
        }
    }
    /// Purge the exact (pubkey, slots) pairs from the accounts index.
    ///
    /// Returns the reclaimed (slot, AccountInfo) entries plus the set of
    /// pubkeys whose index entries became empty and were removed from the
    /// index (per handle_dead_keys; callers must not unref those again).
    #[must_use]
    pub fn purge_keys_exact<C>(
        &self,
        pubkey_to_slot_set: impl IntoIterator<Item = (Pubkey, C)>,
    ) -> (
        ReclaimsSlotList<AccountInfo>,
        PubkeysRemovedFromAccountsIndex,
    )
    where
        C: for<'a> Contains<'a, Slot>,
    {
        let mut reclaims = ReclaimsSlotList::new();
        let mut dead_keys = Vec::new();
        let mut purge_exact_count = 0;
        let (_, purge_exact_us) =
            measure_us!(for (pubkey, slots_set) in pubkey_to_slot_set.into_iter() {
                purge_exact_count += 1;
                // purge_exact reports whether the pubkey has no index entries left
                let is_empty = self
                    .accounts_index
                    .purge_exact(&pubkey, slots_set, &mut reclaims);
                if is_empty {
                    dead_keys.push(pubkey);
                }
            });
        let (pubkeys_removed_from_accounts_index, handle_dead_keys_us) = measure_us!(self
            .accounts_index
            .handle_dead_keys(&dead_keys, &self.account_indexes));
        self.stats
            .purge_exact_count
            .fetch_add(purge_exact_count, Ordering::Relaxed);
        self.stats
            .handle_dead_keys_us
            .fetch_add(handle_dead_keys_us, Ordering::Relaxed);
        self.stats
            .purge_exact_us
            .fetch_add(purge_exact_us, Ordering::Relaxed);
        (reclaims, pubkeys_removed_from_accounts_index)
    }
  1476. fn max_clean_root(&self, proposed_clean_root: Option<Slot>) -> Option<Slot> {
  1477. match (
  1478. self.accounts_index.min_ongoing_scan_root(),
  1479. proposed_clean_root,
  1480. ) {
  1481. (None, None) => None,
  1482. (Some(min_scan_root), None) => Some(min_scan_root),
  1483. (None, Some(proposed_clean_root)) => Some(proposed_clean_root),
  1484. (Some(min_scan_root), Some(proposed_clean_root)) => {
  1485. Some(std::cmp::min(min_scan_root, proposed_clean_root))
  1486. }
  1487. }
  1488. }
    /// get the oldest slot that is within one epoch of the highest known root.
    /// The slot will have been offset by `self.ancient_append_vec_offset`
    fn get_oldest_non_ancient_slot(&self, epoch_schedule: &EpochSchedule) -> Slot {
        self.get_oldest_non_ancient_slot_from_slot(
            epoch_schedule,
            self.accounts_index.max_root_inclusive(),
        )
    }

    /// get the oldest slot that is within one epoch of `max_root_inclusive`.
    /// The slot will have been offset by `self.ancient_append_vec_offset`
    fn get_oldest_non_ancient_slot_from_slot(
        &self,
        epoch_schedule: &EpochSchedule,
        max_root_inclusive: Slot,
    ) -> Slot {
        let mut result = max_root_inclusive;
        // first apply the configured ancient offset, if any
        if let Some(offset) = self.ancient_append_vec_offset {
            result = Self::apply_offset_to_slot(result, offset);
        }
        // then step back (slots_per_epoch - 1) slots; saturating_sub guards a
        // zero slots_per_epoch
        result = Self::apply_offset_to_slot(
            result,
            -((epoch_schedule.slots_per_epoch as i64).saturating_sub(1)),
        );
        // never return a slot beyond max_root_inclusive (a positive offset
        // could otherwise push past it)
        result.min(max_root_inclusive)
    }
  1514. /// Collect all the uncleaned slots, up to a max slot
  1515. ///
  1516. /// Search through the uncleaned Pubkeys and return all the slots, up to a maximum slot.
  1517. fn collect_uncleaned_slots_up_to_slot(&self, max_slot_inclusive: Slot) -> Vec<Slot> {
  1518. self.uncleaned_pubkeys
  1519. .iter()
  1520. .filter_map(|entry| {
  1521. let slot = *entry.key();
  1522. (slot <= max_slot_inclusive).then_some(slot)
  1523. })
  1524. .collect()
  1525. }
    /// For each slot in the list of uncleaned slots, up to a maximum
    /// slot, remove it from the `uncleaned_pubkeys` and move all the
    /// pubkeys to `candidates` for cleaning.
    fn remove_uncleaned_slots_up_to_slot_and_move_pubkeys(
        &self,
        max_slot_inclusive: Slot,
        candidates: &[RwLock<HashMap<Pubkey, CleaningInfo>>],
    ) {
        let uncleaned_slots = self.collect_uncleaned_slots_up_to_slot(max_slot_inclusive);
        for uncleaned_slot in uncleaned_slots.into_iter() {
            if let Some((_removed_slot, mut removed_pubkeys)) =
                self.uncleaned_pubkeys.remove(&uncleaned_slot)
            {
                // Sort all keys by bin index so that we can insert
                // them in `candidates` more efficiently.
                removed_pubkeys.sort_by(|a, b| {
                    self.accounts_index
                        .bin_calculator
                        .bin_from_pubkey(a)
                        .cmp(&self.accounts_index.bin_calculator.bin_from_pubkey(b))
                });
                if let Some(first_removed_pubkey) = removed_pubkeys.first() {
                    // Keys are bin-sorted, so the bin write lock only needs to
                    // be re-acquired when the bin index actually changes.
                    let mut prev_bin = self
                        .accounts_index
                        .bin_calculator
                        .bin_from_pubkey(first_removed_pubkey);
                    let mut candidates_bin = candidates[prev_bin].write().unwrap();
                    for removed_pubkey in removed_pubkeys {
                        let curr_bin = self
                            .accounts_index
                            .bin_calculator
                            .bin_from_pubkey(&removed_pubkey);
                        if curr_bin != prev_bin {
                            // swap the guard: old bin unlocks, new bin locks
                            candidates_bin = candidates[curr_bin].write().unwrap();
                            prev_bin = curr_bin;
                        }
                        // Conservatively mark the candidate might have a zero lamport entry for
                        // correctness so that scan WILL try to look in disk if it is
                        // not in-mem. These keys are from 1) recently processed
                        // slots, 2) zero lamports found in shrink. Therefore, they are very likely
                        // to be in-memory, and seldomly do we need to look them up in disk.
                        candidates_bin.insert(
                            removed_pubkey,
                            CleaningInfo {
                                might_contain_zero_lamport_entry: true,
                                ..Default::default()
                            },
                        );
                    }
                }
            }
        }
    }
  1579. fn count_pubkeys(candidates: &[RwLock<HashMap<Pubkey, CleaningInfo>>]) -> u64 {
  1580. candidates
  1581. .iter()
  1582. .map(|x| x.read().unwrap().len())
  1583. .sum::<usize>() as u64
  1584. }
/// Construct a list of candidates for cleaning from:
/// - dirty_stores -- set of stores which had accounts removed or recently rooted
/// - uncleaned_pubkeys -- the delta set of updated pubkeys in rooted slots from the last clean
///
/// The function also returns the minimum slot we encountered.
fn construct_candidate_clean_keys(
    &self,
    max_clean_root_inclusive: Option<Slot>,
    is_startup: bool,
    timings: &mut CleanKeyTimings,
    epoch_schedule: &EpochSchedule,
) -> CleaningCandidates {
    let oldest_non_ancient_slot = self.get_oldest_non_ancient_slot(epoch_schedule);
    let mut dirty_store_processing_time = Measure::start("dirty_store_processing");
    let max_root_inclusive = self.accounts_index.max_root_inclusive();
    // If the caller did not cap the clean root, clean up to the max root.
    let max_slot_inclusive = max_clean_root_inclusive.unwrap_or(max_root_inclusive);
    let mut dirty_stores = Vec::with_capacity(self.dirty_stores.len());
    // find the oldest dirty slot
    // we'll add logging if that append vec cannot be marked dead
    let mut min_dirty_slot = None::<u64>;
    // Drain eligible (slot <= max_slot_inclusive) entries out of
    // self.dirty_stores into the local `dirty_stores` vec; entries above the
    // cutoff are retained for a later clean.
    self.dirty_stores.retain(|slot, store| {
        if *slot > max_slot_inclusive {
            true
        } else {
            min_dirty_slot = min_dirty_slot.map(|min| min.min(*slot)).or(Some(*slot));
            dirty_stores.push((*slot, store.clone()));
            false
        }
    });
    let dirty_stores_len = dirty_stores.len();
    let num_bins = self.accounts_index.bins();
    // One candidate map per index bin, individually locked so parallel
    // scanners below can insert concurrently.
    let candidates: Box<_> =
        std::iter::repeat_with(|| RwLock::new(HashMap::<Pubkey, CleaningInfo>::new()))
            .take(num_bins)
            .collect();
    // Insert (or merge into) a candidate entry in the appropriate bin;
    // `might_contain_zero_lamport_entry` is sticky once set.
    let insert_candidate = |pubkey, is_zero_lamport| {
        let index = self.accounts_index.bin_calculator.bin_from_pubkey(&pubkey);
        let mut candidates_bin = candidates[index].write().unwrap();
        candidates_bin
            .entry(pubkey)
            .or_default()
            .might_contain_zero_lamport_entry |= is_zero_lamport;
    };
    let dirty_ancient_stores = AtomicUsize::default();
    // Scan all drained dirty stores in parallel chunks, recording every
    // pubkey they contain as a cleaning candidate and tracking the oldest
    // dirty slot seen (for timings).
    let mut dirty_store_routine = || {
        let chunk_size = 1.max(dirty_stores_len.saturating_div(rayon::current_num_threads()));
        let oldest_dirty_slots: Vec<u64> = dirty_stores
            .par_chunks(chunk_size)
            .map(|dirty_store_chunk| {
                // Start above the cutoff so `min` works even for an empty chunk.
                let mut oldest_dirty_slot = max_slot_inclusive.saturating_add(1);
                dirty_store_chunk.iter().for_each(|(slot, store)| {
                    if *slot < oldest_non_ancient_slot {
                        dirty_ancient_stores.fetch_add(1, Ordering::Relaxed);
                    }
                    oldest_dirty_slot = oldest_dirty_slot.min(*slot);
                    store
                        .accounts
                        .scan_accounts_without_data(|_offset, account| {
                            let pubkey = *account.pubkey();
                            let is_zero_lamport = account.is_zero_lamport();
                            insert_candidate(pubkey, is_zero_lamport);
                        })
                        .expect("must scan accounts storage");
                });
                oldest_dirty_slot
            })
            .collect();
        timings.oldest_dirty_slot = *oldest_dirty_slots
            .iter()
            .min()
            .unwrap_or(&max_slot_inclusive.saturating_add(1));
    };
    if is_startup {
        // Free to consume all the cores during startup
        dirty_store_routine();
    } else {
        self.thread_pool_background.install(|| {
            dirty_store_routine();
        });
    }
    timings.dirty_pubkeys_count = Self::count_pubkeys(&candidates);
    trace!(
        "dirty_stores.len: {} pubkeys.len: {}",
        dirty_stores_len,
        timings.dirty_pubkeys_count,
    );
    dirty_store_processing_time.stop();
    timings.dirty_store_processing_us += dirty_store_processing_time.as_us();
    timings.dirty_ancient_stores = dirty_ancient_stores.load(Ordering::Relaxed);
    // Merge in the delta set of uncleaned pubkeys from rooted slots.
    let mut collect_delta_keys = Measure::start("key_create");
    self.remove_uncleaned_slots_up_to_slot_and_move_pubkeys(max_slot_inclusive, &candidates);
    collect_delta_keys.stop();
    timings.collect_delta_keys_us += collect_delta_keys.as_us();
    timings.delta_key_count = Self::count_pubkeys(&candidates);
    // Check if we should purge any of the
    // zero_lamport_accounts_to_purge_later, based on the
    // latest_full_snapshot_slot.
    let latest_full_snapshot_slot = self.latest_full_snapshot_slot();
    assert!(
        latest_full_snapshot_slot.is_some()
            || self
                .zero_lamport_accounts_to_purge_after_full_snapshot
                .is_empty(),
        "if snapshots are disabled, then zero_lamport_accounts_to_purge_later should always \
         be empty"
    );
    if let Some(latest_full_snapshot_slot) = latest_full_snapshot_slot {
        // Deferred zero-lamport accounts become candidates once both the
        // clean cutoff and the latest full snapshot have passed their slot;
        // such entries are removed from the deferred set.
        self.zero_lamport_accounts_to_purge_after_full_snapshot
            .retain(|(slot, pubkey)| {
                let is_candidate_for_clean =
                    max_slot_inclusive >= *slot && latest_full_snapshot_slot >= *slot;
                if is_candidate_for_clean {
                    insert_candidate(*pubkey, true);
                }
                !is_candidate_for_clean
            });
    }
    (candidates, min_dirty_slot)
}
  1704. /// called with cli argument to verify refcounts are correct on all accounts
  1705. /// this is very slow
  1706. /// this function will call Rayon par_iter, so you will want to have thread pool installed if
  1707. /// you want to call this without consuming all the cores on the CPU.
  1708. fn exhaustively_verify_refcounts(&self, max_slot_inclusive: Option<Slot>) {
  1709. let max_slot_inclusive =
  1710. max_slot_inclusive.unwrap_or_else(|| self.accounts_index.max_root_inclusive());
  1711. info!("exhaustively verifying refcounts as of slot: {max_slot_inclusive}");
  1712. let pubkey_refcount = DashMap::<Pubkey, Vec<Slot>>::default();
  1713. let mut storages = self.storage.all_storages();
  1714. storages.retain(|s| s.slot() <= max_slot_inclusive);
  1715. // populate
  1716. storages.par_iter().for_each_init(
  1717. || Box::new(append_vec::new_scan_accounts_reader()),
  1718. |reader, storage| {
  1719. let slot = storage.slot();
  1720. storage
  1721. .accounts
  1722. .scan_accounts(reader.as_mut(), |_offset, account| {
  1723. let pk = account.pubkey();
  1724. match pubkey_refcount.entry(*pk) {
  1725. dashmap::mapref::entry::Entry::Occupied(mut occupied_entry) => {
  1726. if !occupied_entry.get().iter().any(|s| s == &slot) {
  1727. occupied_entry.get_mut().push(slot);
  1728. }
  1729. }
  1730. dashmap::mapref::entry::Entry::Vacant(vacant_entry) => {
  1731. vacant_entry.insert(vec![slot]);
  1732. }
  1733. }
  1734. })
  1735. .expect("must scan accounts storage")
  1736. },
  1737. );
  1738. let total = pubkey_refcount.len();
  1739. let failed = AtomicBool::default();
  1740. let threads = quarter_thread_count();
  1741. let per_batch = total / threads;
  1742. (0..=threads).into_par_iter().for_each(|attempt| {
  1743. pubkey_refcount
  1744. .iter()
  1745. .skip(attempt * per_batch)
  1746. .take(per_batch)
  1747. .for_each(|entry| {
  1748. if failed.load(Ordering::Relaxed) {
  1749. return;
  1750. }
  1751. self.accounts_index
  1752. .get_and_then(entry.key(), |index_entry| {
  1753. if let Some(index_entry) = index_entry {
  1754. match (index_entry.ref_count() as usize).cmp(&entry.value().len()) {
  1755. std::cmp::Ordering::Equal => {
  1756. // ref counts match, nothing to do here
  1757. }
  1758. std::cmp::Ordering::Greater => {
  1759. let slot_list = index_entry.slot_list_read_lock();
  1760. let num_too_new = slot_list
  1761. .iter()
  1762. .filter(|(slot, _)| slot > &max_slot_inclusive)
  1763. .count();
  1764. if ((index_entry.ref_count() as usize) - num_too_new)
  1765. > entry.value().len()
  1766. {
  1767. failed.store(true, Ordering::Relaxed);
  1768. error!(
  1769. "exhaustively_verify_refcounts: {} refcount too \
  1770. large: {}, should be: {}, {:?}, {:?}, too_new: \
  1771. {num_too_new}",
  1772. entry.key(),
  1773. index_entry.ref_count(),
  1774. entry.value().len(),
  1775. *entry.value(),
  1776. slot_list
  1777. );
  1778. }
  1779. }
  1780. std::cmp::Ordering::Less => {
  1781. error!(
  1782. "exhaustively_verify_refcounts: {} refcount too \
  1783. small: {}, should be: {}, {:?}, {:?}",
  1784. entry.key(),
  1785. index_entry.ref_count(),
  1786. entry.value().len(),
  1787. *entry.value(),
  1788. index_entry.slot_list_read_lock()
  1789. );
  1790. }
  1791. }
  1792. };
  1793. (false, ())
  1794. });
  1795. });
  1796. });
  1797. if failed.load(Ordering::Relaxed) {
  1798. panic!("exhaustively_verify_refcounts failed");
  1799. }
  1800. }
// Purge zero lamport accounts and older rooted account states as garbage
// collection
// Only remove those accounts where the entire rooted history of the account
// can be purged because there are no live append vecs in the ancestors
pub fn clean_accounts(
    &self,
    max_clean_root_inclusive: Option<Slot>,
    is_startup: bool,
    epoch_schedule: &EpochSchedule,
) {
    // Optional (cli-enabled) slow pre-pass that validates index refcounts.
    if self.exhaustively_verify_refcounts {
        //at startup use all cores to verify refcounts
        if is_startup {
            self.exhaustively_verify_refcounts(max_clean_root_inclusive);
        } else {
            // otherwise, use the background thread pool
            self.thread_pool_background
                .install(|| self.exhaustively_verify_refcounts(max_clean_root_inclusive));
        }
    }
    let _guard = self.active_stats.activate(ActiveStatItem::Clean);
    let ancient_account_cleans = AtomicU64::default();
    let purges_old_accounts_count = AtomicU64::default();
    let mut measure_all = Measure::start("clean_accounts");
    let max_clean_root_inclusive = self.max_clean_root(max_clean_root_inclusive);
    self.report_store_stats();
    // Phase 1: gather candidate pubkeys (per-bin maps) from dirty stores and
    // the uncleaned-pubkeys delta set.
    let active_guard = self
        .active_stats
        .activate(ActiveStatItem::CleanConstructCandidates);
    let mut measure_construct_candidates = Measure::start("construct_candidates");
    let mut key_timings = CleanKeyTimings::default();
    let (mut candidates, min_dirty_slot) = self.construct_candidate_clean_keys(
        max_clean_root_inclusive,
        is_startup,
        &mut key_timings,
        epoch_schedule,
    );
    measure_construct_candidates.stop();
    drop(active_guard);
    let num_candidates = Self::count_pubkeys(&candidates);
    let found_not_zero_accum = AtomicU64::new(0);
    let not_found_on_fork_accum = AtomicU64::new(0);
    let missing_accum = AtomicU64::new(0);
    let useful_accum = AtomicU64::new(0);
    let reclaims: SlotList<AccountInfo> = SlotList::with_capacity(num_candidates as usize);
    let reclaims = Mutex::new(reclaims);
    let pubkeys_removed_from_accounts_index: PubkeysRemovedFromAccountsIndex = HashSet::new();
    let pubkeys_removed_from_accounts_index = Mutex::new(pubkeys_removed_from_accounts_index);
    // parallel scan the index.
    // Phase 2: for each candidate, consult the accounts index to decide
    // whether it can be purged (zero-lamport at the latest root) and/or has
    // older rooted entries to reclaim. Candidates with an empty slot_list
    // after the scan are dropped from the bin.
    let do_clean_scan = || {
        candidates.par_iter().for_each(|candidates_bin| {
            let mut found_not_zero = 0;
            let mut not_found_on_fork = 0;
            let mut missing = 0;
            let mut useful = 0;
            let mut purges_old_accounts_local = 0;
            let mut candidates_bin = candidates_bin.write().unwrap();
            // Iterate over each HashMap entry to
            // avoid capturing the HashMap in the
            // closure passed to scan thus making
            // conflicting read and write borrows.
            candidates_bin.retain(|candidate_pubkey, candidate_info| {
                let mut should_collect_reclaims = false;
                self.accounts_index.scan(
                    iter::once(candidate_pubkey),
                    |_candidate_pubkey, slot_list_and_ref_count| {
                        // `useless` means this candidate contributed nothing
                        // to cleaning; tracked for stats only.
                        let mut useless = true;
                        if let Some((slot_list, ref_count)) = slot_list_and_ref_count {
                            // find the highest rooted slot in the slot list
                            let index_in_slot_list = self.accounts_index.latest_slot(
                                None,
                                slot_list,
                                max_clean_root_inclusive,
                            );
                            match index_in_slot_list {
                                Some(index_in_slot_list) => {
                                    // found info relative to max_clean_root
                                    let (slot, account_info) = &slot_list[index_in_slot_list];
                                    if account_info.is_zero_lamport() {
                                        useless = false;
                                        // The latest one is zero lamports. We may be able to purge it.
                                        // Add all the rooted entries that contain this pubkey.
                                        // We know the highest rooted entry is zero lamports.
                                        candidate_info.slot_list =
                                            self.accounts_index.get_rooted_entries(
                                                slot_list,
                                                max_clean_root_inclusive,
                                            );
                                        candidate_info.ref_count = ref_count;
                                    } else {
                                        found_not_zero += 1;
                                    }
                                    // If this candidate has multiple rooted slot list entries,
                                    // we should reclaim the older ones.
                                    if slot_list.len() > 1
                                        && *slot
                                            <= max_clean_root_inclusive.unwrap_or(Slot::MAX)
                                    {
                                        should_collect_reclaims = true;
                                        purges_old_accounts_local += 1;
                                        useless = false;
                                    }
                                }
                                None => {
                                    // This pubkey is in the index but not in a root slot, so clean
                                    // it up by adding it to the to-be-purged list.
                                    //
                                    // Also, this pubkey must have been touched by some slot since
                                    // it was in the dirty list, so we assume that the slot it was
                                    // touched in must be unrooted.
                                    not_found_on_fork += 1;
                                    should_collect_reclaims = true;
                                    purges_old_accounts_local += 1;
                                    useless = false;
                                }
                            }
                        } else {
                            missing += 1;
                        }
                        if !useless {
                            useful += 1;
                        }
                        AccountsIndexScanResult::OnlyKeepInMemoryIfDirty
                    },
                    None,
                    // If the candidate may have a zero-lamport entry, scan
                    // everything (including disk); otherwise use the
                    // configured shrink scan filter.
                    if candidate_info.might_contain_zero_lamport_entry {
                        ScanFilter::All
                    } else {
                        self.scan_filter_for_shrinking
                    },
                );
                if should_collect_reclaims {
                    let reclaims_new = self.collect_reclaims(
                        candidate_pubkey,
                        max_clean_root_inclusive,
                        &ancient_account_cleans,
                        epoch_schedule,
                        &pubkeys_removed_from_accounts_index,
                    );
                    if !reclaims_new.is_empty() {
                        reclaims.lock().unwrap().extend(reclaims_new);
                    }
                }
                // keep only candidates that still have rooted entries to purge
                !candidate_info.slot_list.is_empty()
            });
            found_not_zero_accum.fetch_add(found_not_zero, Ordering::Relaxed);
            not_found_on_fork_accum.fetch_add(not_found_on_fork, Ordering::Relaxed);
            missing_accum.fetch_add(missing, Ordering::Relaxed);
            useful_accum.fetch_add(useful, Ordering::Relaxed);
            purges_old_accounts_count.fetch_add(purges_old_accounts_local, Ordering::Relaxed);
        });
    };
    let active_guard = self
        .active_stats
        .activate(ActiveStatItem::CleanScanCandidates);
    let mut accounts_scan = Measure::start("accounts_scan");
    if is_startup {
        do_clean_scan();
    } else {
        self.thread_pool_background.install(do_clean_scan);
    }
    accounts_scan.stop();
    drop(active_guard);
    // strip the RwLock from the candidate bins now that we no longer need it
    let mut candidates: Box<_> = candidates
        .iter_mut()
        .map(|candidates_bin| mem::take(candidates_bin.get_mut().unwrap()))
        .collect();
    let retained_keys_count: usize = candidates.iter().map(HashMap::len).sum();
    let reclaims = reclaims.into_inner().unwrap();
    let mut pubkeys_removed_from_accounts_index =
        pubkeys_removed_from_accounts_index.into_inner().unwrap();
    // Phase 3: process the reclaims of old rooted entries collected above.
    let active_guard = self.active_stats.activate(ActiveStatItem::CleanOldAccounts);
    let mut clean_old_rooted = Measure::start("clean_old_roots");
    let (purged_account_slots, removed_accounts) =
        self.clean_accounts_older_than_root(&reclaims, &pubkeys_removed_from_accounts_index);
    clean_old_rooted.stop();
    drop(active_guard);
    // Calculate store counts as if everything was purged
    // Then purge if we can
    // Phase 4: for each remaining candidate entry, compute per-slot store
    // counts assuming all candidates get purged, pruning slot-list entries
    // already handled by the reclaim pass above.
    let active_guard = self
        .active_stats
        .activate(ActiveStatItem::CleanCollectStoreCounts);
    let mut store_counts_time = Measure::start("store_counts");
    let mut store_counts: HashMap<Slot, (usize, HashSet<Pubkey>)> = HashMap::new();
    for candidates_bin in candidates.iter_mut() {
        for (pubkey, cleaning_info) in candidates_bin.iter_mut() {
            let slot_list = &mut cleaning_info.slot_list;
            let ref_count = &mut cleaning_info.ref_count;
            debug_assert!(!slot_list.is_empty(), "candidate slot_list can't be empty");
            if purged_account_slots.contains_key(pubkey) {
                // refresh the ref count since the reclaim pass may have changed it
                *ref_count = self.accounts_index.ref_count_from_storage(pubkey);
            }
            slot_list.retain(|(slot, account_info)| {
                let was_slot_purged = purged_account_slots
                    .get(pubkey)
                    .map(|slots_removed| slots_removed.contains(slot))
                    .unwrap_or(false);
                if was_slot_purged {
                    // No need to look up the slot storage below if the entire
                    // slot was purged
                    return false;
                }
                // Check if this update in `slot` to the account with `key` was reclaimed earlier by
                // `clean_accounts_older_than_root()`
                let was_reclaimed = removed_accounts
                    .get(slot)
                    .map(|store_removed| store_removed.contains(&account_info.offset()))
                    .unwrap_or(false);
                if was_reclaimed {
                    return false;
                }
                if let Some(store_count) = store_counts.get_mut(slot) {
                    // another candidate already initialized this slot's count;
                    // just account for this pubkey too
                    store_count.0 -= 1;
                    store_count.1.insert(*pubkey);
                } else {
                    let mut key_set = HashSet::new();
                    key_set.insert(*pubkey);
                    assert!(
                        !account_info.is_cached(),
                        "The Accounts Cache must be flushed first for this account info. \
                         pubkey: {}, slot: {}",
                        *pubkey,
                        *slot
                    );
                    // initial count = live accounts in the store minus this candidate
                    let count = self
                        .storage
                        .get_account_storage_entry(*slot, account_info.store_id())
                        .map(|store| store.count())
                        .unwrap()
                        - 1;
                    debug!(
                        "store_counts, inserting slot: {}, store id: {}, count: {}",
                        slot,
                        account_info.store_id(),
                        count
                    );
                    store_counts.insert(*slot, (count, key_set));
                }
                true
            });
        }
    }
    store_counts_time.stop();
    drop(active_guard);
    // Phase 5: propagate "can't delete" constraints between candidates that
    // share stores.
    let active_guard = self
        .active_stats
        .activate(ActiveStatItem::CleanCalcDeleteDeps);
    let mut calc_deps_time = Measure::start("calc_deps");
    self.calc_delete_dependencies(&candidates, &mut store_counts, min_dirty_slot);
    calc_deps_time.stop();
    drop(active_guard);
    // Phase 6: drop zero-lamport purges that would break incremental
    // snapshot correctness (see that function's docs).
    let active_guard = self
        .active_stats
        .activate(ActiveStatItem::CleanFilterZeroLamport);
    let mut purge_filter = Measure::start("purge_filter");
    self.filter_zero_lamport_clean_for_incremental_snapshots(
        max_clean_root_inclusive,
        &store_counts,
        &mut candidates,
    );
    purge_filter.stop();
    drop(active_guard);
    let active_guard = self.active_stats.activate(ActiveStatItem::CleanReclaims);
    let mut reclaims_time = Measure::start("reclaims");
    // Recalculate reclaims with new purge set
    // Phase 7: exact-purge the surviving candidates and handle the resulting
    // reclaims (possibly removing now-dead slots).
    let mut pubkey_to_slot_set = Vec::new();
    for candidates_bin in candidates {
        let mut bin_set = candidates_bin
            .into_iter()
            .filter_map(|(pubkey, cleaning_info)| {
                let slot_list = cleaning_info.slot_list;
                (!slot_list.is_empty()).then_some((
                    pubkey,
                    slot_list
                        .iter()
                        .map(|(slot, _)| *slot)
                        .collect::<HashSet<Slot>>(),
                ))
            })
            .collect::<Vec<_>>();
        pubkey_to_slot_set.append(&mut bin_set);
    }
    let (reclaims, pubkeys_removed_from_accounts_index2) =
        self.purge_keys_exact(pubkey_to_slot_set);
    pubkeys_removed_from_accounts_index.extend(pubkeys_removed_from_accounts_index2);
    if !reclaims.is_empty() {
        self.handle_reclaims(
            reclaims.iter(),
            None,
            &pubkeys_removed_from_accounts_index,
            HandleReclaims::ProcessDeadSlots(&self.clean_accounts_stats.purge_stats),
            MarkAccountsObsolete::No,
        );
    }
    reclaims_time.stop();
    drop(active_guard);
    measure_all.stop();
    // Phase 8: report all timings/stats gathered above.
    self.clean_accounts_stats.report();
    datapoint_info!(
        "clean_accounts",
        ("max_clean_root", max_clean_root_inclusive, Option<i64>),
        ("total_us", measure_all.as_us(), i64),
        (
            "collect_delta_keys_us",
            key_timings.collect_delta_keys_us,
            i64
        ),
        ("oldest_dirty_slot", key_timings.oldest_dirty_slot, i64),
        (
            "pubkeys_removed_from_accounts_index",
            pubkeys_removed_from_accounts_index.len(),
            i64
        ),
        (
            "dirty_ancient_stores",
            key_timings.dirty_ancient_stores,
            i64
        ),
        (
            "dirty_store_processing_us",
            key_timings.dirty_store_processing_us,
            i64
        ),
        ("construct_candidates_us", measure_construct_candidates.as_us(), i64),
        ("accounts_scan", accounts_scan.as_us(), i64),
        ("clean_old_rooted", clean_old_rooted.as_us(), i64),
        ("store_counts", store_counts_time.as_us(), i64),
        ("purge_filter", purge_filter.as_us(), i64),
        ("calc_deps", calc_deps_time.as_us(), i64),
        ("reclaims", reclaims_time.as_us(), i64),
        ("delta_insert_us", key_timings.delta_insert_us, i64),
        ("delta_key_count", key_timings.delta_key_count, i64),
        ("dirty_pubkeys_count", key_timings.dirty_pubkeys_count, i64),
        ("useful_keys", useful_accum.load(Ordering::Relaxed), i64),
        ("total_keys_count", num_candidates, i64),
        ("retained_keys_count", retained_keys_count, i64),
        (
            "scan_found_not_zero",
            found_not_zero_accum.load(Ordering::Relaxed),
            i64
        ),
        (
            "scan_not_found_on_fork",
            not_found_on_fork_accum.load(Ordering::Relaxed),
            i64
        ),
        ("scan_missing", missing_accum.load(Ordering::Relaxed), i64),
        (
            "get_account_sizes_us",
            self.clean_accounts_stats
                .get_account_sizes_us
                .swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "slots_cleaned",
            self.clean_accounts_stats
                .slots_cleaned
                .swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "clean_old_root_us",
            self.clean_accounts_stats
                .clean_old_root_us
                .swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "clean_old_root_reclaim_us",
            self.clean_accounts_stats
                .clean_old_root_reclaim_us
                .swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "remove_dead_accounts_remove_us",
            self.clean_accounts_stats
                .remove_dead_accounts_remove_us
                .swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "remove_dead_accounts_shrink_us",
            self.clean_accounts_stats
                .remove_dead_accounts_shrink_us
                .swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "clean_stored_dead_slots_us",
            self.clean_accounts_stats
                .clean_stored_dead_slots_us
                .swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "roots_added",
            self.accounts_index.roots_added.swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "purge_older_root_entries_one_slot_list",
            self.accounts_index
                .purge_older_root_entries_one_slot_list
                .swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "roots_removed",
            self.accounts_index.roots_removed.swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "active_scans",
            self.accounts_index.active_scans.load(Ordering::Relaxed),
            i64
        ),
        (
            "max_distance_to_min_scan_slot",
            self.accounts_index
                .max_distance_to_min_scan_slot
                .swap(0, Ordering::Relaxed),
            i64
        ),
        (
            "ancient_account_cleans",
            ancient_account_cleans.load(Ordering::Relaxed),
            i64
        ),
        (
            "purges_old_accounts_count",
            purges_old_accounts_count.load(Ordering::Relaxed),
            i64
        ),
        ("next_store_id", self.next_id.load(Ordering::Relaxed), i64),
    );
}
/// Removes the accounts in the input `reclaims` from the tracked "count" of
/// their corresponding storage entries. Note this does not actually free
/// the memory from the storage entries until all the storage entries for
/// a given slot `S` are empty, at which point `process_dead_slots` will
/// remove all the storage entries for `S`.
///
/// # Arguments
/// * `reclaims` - The accounts to remove from storage entries' "count". Note here
///   that we should not remove cache entries, only entries for accounts actually
///   stored in a storage entry.
/// * `expected_single_dead_slot` - A correctness assertion. If this is equal to `Some(S)`,
///   then the function will check that the only slot being cleaned up in `reclaims`
///   is the slot == `S`. This is true for instance when `handle_reclaims` is called
///   from store or slot shrinking, as those should only touch the slot they are
///   currently storing to or shrinking.
/// * `pubkeys_removed_from_accounts_index` - These keys have already been removed from the
///   accounts index and should not be unref'd. If they exist in the accounts index,
///   they are NEW.
/// * `handle_reclaims`. `purge_stats` are stats used to track performance of purging
///   dead slots if value is `ProcessDeadSlots`.
///   Otherwise, there can be no dead slots
///   that happen as a result of this call, and the function will check that no slots are
///   cleaned up/removed via `process_dead_slots`. For instance, on store, no slots should
///   be cleaned up, but during the background clean accounts purges accounts from old rooted
///   slots, so outdated slots may be removed.
/// * 'mark_accounts_obsolete' - Whether to mark accounts as obsolete or not. If `Yes`, then
///   obsolete account entry will be marked in the storage so snapshots/accounts hash can
///   determine the state of the account at a specified slot. This should only be done if the
///   account is already unrefed and removed from the accounts index
///   It must be unrefed and removed to avoid double counting or missed counting in shrink
fn handle_reclaims<'a, I>(
    &'a self,
    reclaims: I,
    expected_single_dead_slot: Option<Slot>,
    pubkeys_removed_from_accounts_index: &PubkeysRemovedFromAccountsIndex,
    handle_reclaims: HandleReclaims<'a>,
    mark_accounts_obsolete: MarkAccountsObsolete,
) -> ReclaimResult
where
    I: Iterator<Item = &'a (Slot, AccountInfo)>,
{
    let mut reclaim_result = ReclaimResult::default();
    // Decrement storage counts; collect slots that became entirely dead and
    // the per-slot offsets of the removed accounts.
    let (dead_slots, reclaimed_offsets) =
        self.remove_dead_accounts(reclaims, expected_single_dead_slot, mark_accounts_obsolete);
    reclaim_result.1 = reclaimed_offsets;
    // NOTE: this binding is irrefutable today because ProcessDeadSlots is the
    // only variant of HandleReclaims visible here.
    let HandleReclaims::ProcessDeadSlots(purge_stats) = handle_reclaims;
    if let Some(expected_single_dead_slot) = expected_single_dead_slot {
        // caller promised at most one slot could die, and only that one
        assert!(dead_slots.len() <= 1);
        if dead_slots.len() == 1 {
            assert!(dead_slots.contains(&expected_single_dead_slot));
        }
    }
    // if we are marking accounts obsolete, then any dead slots have already been cleaned
    let clean_stored_dead_slots =
        !matches!(mark_accounts_obsolete, MarkAccountsObsolete::Yes(_));
    self.process_dead_slots(
        &dead_slots,
        Some(&mut reclaim_result.0),
        purge_stats,
        pubkeys_removed_from_accounts_index,
        clean_stored_dead_slots,
    );
    reclaim_result
}
  2304. /// During clean, some zero-lamport accounts that are marked for purge should *not* actually
  2305. /// get purged. Filter out those accounts here by removing them from 'candidates'.
  2306. /// Candidates may contain entries with empty slots list in CleaningInfo.
  2307. /// The function removes such entries from 'candidates'.
  2308. ///
  2309. /// When using incremental snapshots, do not purge zero-lamport accounts if the slot is higher
  2310. /// than the latest full snapshot slot. This is to protect against the following scenario:
  2311. ///
  2312. /// ```text
  2313. /// A full snapshot is taken, including account 'alpha' with a non-zero balance. In a later slot,
  2314. /// alpha's lamports go to zero. Eventually, cleaning runs. Without this change,
  2315. /// alpha would be cleaned up and removed completely. Finally, an incremental snapshot is taken.
  2316. ///
  2317. /// Later, the incremental and full snapshots are used to rebuild the bank and accounts
  2318. /// database (e.x. if the node restarts). The full snapshot _does_ contain alpha
  2319. /// and its balance is non-zero. However, since alpha was cleaned up in a slot after the full
  2320. /// snapshot slot (due to having zero lamports), the incremental snapshot would not contain alpha.
  2321. /// Thus, the accounts database will contain the old, incorrect info for alpha with a non-zero
  2322. /// balance. Very bad!
  2323. /// ```
  2324. ///
  2325. /// This filtering step can be skipped if there is no `latest_full_snapshot_slot`, or if the
  2326. /// `max_clean_root_inclusive` is less-than-or-equal-to the `latest_full_snapshot_slot`.
fn filter_zero_lamport_clean_for_incremental_snapshots(
    &self,
    max_clean_root_inclusive: Option<Slot>,
    store_counts: &HashMap<Slot, (usize, HashSet<Pubkey>)>,
    candidates: &mut [HashMap<Pubkey, CleaningInfo>],
) {
    let latest_full_snapshot_slot = self.latest_full_snapshot_slot();
    // Filtering is only needed when clean may operate on slots *newer* than the
    // latest full snapshot slot (see the doc comment above for why that is unsafe).
    // If either value is None, `Slot::MAX` makes the comparison behave as "unbounded".
    let should_filter_for_incremental_snapshots = max_clean_root_inclusive.unwrap_or(Slot::MAX)
        > latest_full_snapshot_slot.unwrap_or(Slot::MAX);
    assert!(
        latest_full_snapshot_slot.is_some() || !should_filter_for_incremental_snapshots,
        "if filtering for incremental snapshots, then snapshots should be enabled",
    );

    for bin in candidates {
        bin.retain(|pubkey, cleaning_info| {
            let slot_list = &cleaning_info.slot_list;
            debug_assert!(!slot_list.is_empty(), "candidate slot_list can't be empty");
            // Only keep candidates where the entire history of the account in the root set
            // can be purged. All AppendVecs for those updates are dead.
            for (slot, _account_info) in slot_list.iter() {
                if let Some(store_count) = store_counts.get(slot) {
                    if store_count.0 != 0 {
                        // one store this pubkey is in is not being removed, so this pubkey cannot be removed at all
                        return false;
                    }
                } else {
                    // store is not being removed, so this pubkey cannot be removed at all
                    return false;
                }
            }

            // Exit early if not filtering more for incremental snapshots
            if !should_filter_for_incremental_snapshots {
                return true;
            }

            // Safety: We exited early if the slot list was empty,
            // so we're guaranteed here that `.max_by_key()` returns Some.
            let (slot, account_info) = slot_list
                .iter()
                .max_by_key(|(slot, _account_info)| slot)
                .unwrap();

            // Do *not* purge zero-lamport accounts if the slot is greater than the last full
            // snapshot slot. Since we're `retain`ing the accounts-to-purge, I felt creating
            // the `cannot_purge` variable made this easier to understand. Accounts that do
            // not get purged here are added to a list so they can be considered for purging
            // later (i.e. after the next full snapshot).
            assert!(account_info.is_zero_lamport());
            // `unwrap()` is safe: the assert above guarantees a full snapshot slot exists
            // whenever `should_filter_for_incremental_snapshots` is true.
            let cannot_purge = *slot > latest_full_snapshot_slot.unwrap();
            if cannot_purge {
                self.zero_lamport_accounts_to_purge_after_full_snapshot
                    .insert((*slot, *pubkey));
            }
            !cannot_purge
        });
    }
}
// Must be kept private!, does sensitive cleanup that should only be called from
// supported pipelines in AccountsDb
/// Handles bookkeeping for slots that have become dead: optionally cleans the
/// pubkeys stored in the dead slots, removes the slots' metadata from the
/// accounts index, purges their storages, and drops them from the
/// shrink-candidate set.
///
/// pubkeys_removed_from_accounts_index - These keys have already been removed from the accounts index
/// and should not be unref'd. If they exist in the accounts index, they are NEW.
/// clean_stored_dead_slots - clean_stored_dead_slots iterates through all the pubkeys in the dead
/// slots and unrefs them in the accounts index if they are not present in
/// pubkeys_removed_from_accounts_index. Skipping clean is the equivalent to
/// pubkeys_removed_from_accounts_index containing all the pubkeys in the dead slots
fn process_dead_slots(
    &self,
    dead_slots: &IntSet<Slot>,
    purged_account_slots: Option<&mut AccountSlots>,
    purge_stats: &PurgeStats,
    pubkeys_removed_from_accounts_index: &PubkeysRemovedFromAccountsIndex,
    clean_stored_dead_slots: bool,
) {
    // Nothing to do (and nothing worth measuring) without dead slots.
    if dead_slots.is_empty() {
        return;
    }
    let mut clean_dead_slots = Measure::start("reclaims::clean_dead_slots");
    if clean_stored_dead_slots {
        self.clean_stored_dead_slots(
            dead_slots,
            purged_account_slots,
            pubkeys_removed_from_accounts_index,
        );
    }
    // Remove dead slots from the accounts index root tracker
    self.remove_dead_slots_metadata(dead_slots.iter());
    clean_dead_slots.stop();

    let mut purge_removed_slots = Measure::start("reclaims::purge_removed_slots");
    self.purge_dead_slots_from_storage(dead_slots.iter(), purge_stats);
    purge_removed_slots.stop();

    // If the slot is dead, remove the need to shrink the storages as
    // the storage entries will be purged.
    {
        let mut list = self.shrink_candidate_slots.lock().unwrap();
        for slot in dead_slots {
            list.remove(slot);
        }
    }

    debug!(
        "process_dead_slots({}): {} {} {:?}",
        dead_slots.len(),
        clean_dead_slots,
        purge_removed_slots,
        dead_slots,
    );
}
/// load the account index entry for the first `count` items in `accounts`
/// store a reference to all alive accounts in `alive_accounts`
/// store all pubkeys dead in `slot_to_shrink` in `pubkeys_to_unref`
/// return sum of account size for all alive accounts
fn load_accounts_index_for_shrink<'a, T: ShrinkCollectRefs<'a>>(
    &self,
    accounts: &'a [AccountFromStorage],
    stats: &ShrinkStats,
    slot_to_shrink: Slot,
) -> LoadAccountsIndexForShrink<'a, T> {
    let count = accounts.len();
    let mut alive_accounts = T::with_capacity(count, slot_to_shrink);
    let mut pubkeys_to_unref = Vec::with_capacity(count);
    let mut zero_lamport_single_ref_pubkeys = Vec::with_capacity(count);

    let mut alive = 0;
    let mut dead = 0;
    // `index` tracks which element of `accounts` the scan callback is
    // currently visiting; the scan visits pubkeys in the order supplied.
    let mut index = 0;
    let mut index_scan_returned_some_count = 0;
    let mut index_scan_returned_none_count = 0;
    let mut all_are_zero_lamports = true;
    let latest_full_snapshot_slot = self.latest_full_snapshot_slot();
    self.accounts_index.scan(
        accounts.iter().map(|account| account.pubkey()),
        |pubkey, slots_refs| {
            let stored_account = &accounts[index];
            // Shared tail for both scan outcomes: either record the account as a
            // deletable zero-lamport single-ref, or keep it as alive.
            let mut do_populate_accounts_for_shrink = |ref_count, slot_list| {
                if stored_account.is_zero_lamport()
                    && ref_count == 1
                    && latest_full_snapshot_slot
                        .map(|latest_full_snapshot_slot| {
                            latest_full_snapshot_slot >= slot_to_shrink
                        })
                        .unwrap_or(true)
                {
                    // only do this if our slot is prior to the latest full snapshot
                    // we found a zero lamport account that is the only instance of this account. We can delete it completely.
                    zero_lamport_single_ref_pubkeys.push(pubkey);
                    self.add_uncleaned_pubkeys_after_shrink(
                        slot_to_shrink,
                        [*pubkey].into_iter(),
                    );
                } else {
                    all_are_zero_lamports &= stored_account.is_zero_lamport();
                    alive_accounts.add(ref_count, stored_account, slot_list);
                    alive += 1;
                }
            };
            if let Some((slot_list, ref_count)) = slots_refs {
                index_scan_returned_some_count += 1;
                let is_alive = slot_list.iter().any(|(slot, _acct_info)| {
                    // if the accounts index contains an entry at this slot, then the append vec we're asking about contains this item and thus, it is alive at this slot
                    *slot == slot_to_shrink
                });

                if !is_alive {
                    // This pubkey was found in the storage, but no longer exists in the index.
                    // It would have had a ref to the storage from the initial store, but it will
                    // not exist in the re-written slot. Unref it to keep the index consistent with
                    // rewriting the storage entries.
                    pubkeys_to_unref.push(pubkey);
                    dead += 1;
                } else {
                    do_populate_accounts_for_shrink(ref_count, slot_list);
                }
            } else {
                index_scan_returned_none_count += 1;
                // getting None here means the account is 'normal' and was written to disk. This means it must have ref_count=1 and
                // slot_list.len() = 1. This means it must be alive in this slot. This is by far the most common case.
                // Note that we could get Some(...) here if the account is in the in mem index because it is hot.
                // Note this could also mean the account isn't on disk either. That would indicate a bug in accounts db.
                // Account is alive.
                let ref_count = 1;
                let slot_list = [(slot_to_shrink, AccountInfo::default())];
                do_populate_accounts_for_shrink(ref_count, &slot_list);
            }
            index += 1;
            AccountsIndexScanResult::OnlyKeepInMemoryIfDirty
        },
        None,
        self.scan_filter_for_shrinking,
    );
    // Every supplied account must have been visited exactly once.
    assert_eq!(index, std::cmp::min(accounts.len(), count));
    stats
        .index_scan_returned_some
        .fetch_add(index_scan_returned_some_count, Ordering::Relaxed);
    stats
        .index_scan_returned_none
        .fetch_add(index_scan_returned_none_count, Ordering::Relaxed);
    stats.alive_accounts.fetch_add(alive, Ordering::Relaxed);
    stats.dead_accounts.fetch_add(dead, Ordering::Relaxed);

    LoadAccountsIndexForShrink {
        alive_accounts,
        pubkeys_to_unref,
        zero_lamport_single_ref_pubkeys,
        all_are_zero_lamports,
    }
}
  2527. /// get all accounts in all the storages passed in
  2528. /// for duplicate pubkeys, the account with the highest write_value is returned
  2529. pub fn get_unique_accounts_from_storage(
  2530. &self,
  2531. store: &AccountStorageEntry,
  2532. ) -> GetUniqueAccountsResult {
  2533. let capacity = store.capacity();
  2534. let mut stored_accounts = Vec::with_capacity(store.count());
  2535. store
  2536. .accounts
  2537. .scan_accounts_without_data(|offset, account| {
  2538. // file_id is unused and can be anything. We will always be loading whatever storage is in the slot.
  2539. let file_id = 0;
  2540. stored_accounts.push(AccountFromStorage {
  2541. index_info: AccountInfo::new(
  2542. StorageLocation::AppendVec(file_id, offset),
  2543. account.is_zero_lamport(),
  2544. ),
  2545. pubkey: *account.pubkey(),
  2546. data_len: account.data_len as u64,
  2547. });
  2548. })
  2549. .expect("must scan accounts storage");
  2550. // sort by pubkey to keep account index lookups close
  2551. let num_duplicated_accounts = Self::sort_and_remove_dups(&mut stored_accounts);
  2552. GetUniqueAccountsResult {
  2553. stored_accounts,
  2554. capacity,
  2555. num_duplicated_accounts,
  2556. }
  2557. }
#[cfg(feature = "dev-context-only-utils")]
/// Dev/test-only override of the storage access mode consumed when
/// storages are (re)opened (e.g. by `reopen_storage_as_readonly_shrinking_in_progress_ok`).
pub fn set_storage_access(&mut self, storage_access: StorageAccess) {
    self.storage_access = storage_access;
}
  2562. /// Sort `accounts` by pubkey and removes all but the *last* of consecutive
  2563. /// accounts in the vector with the same pubkey.
  2564. ///
  2565. /// Return the number of duplicated elements in the vector.
  2566. #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))]
  2567. fn sort_and_remove_dups(accounts: &mut Vec<AccountFromStorage>) -> usize {
  2568. // stable sort because we want the most recent only
  2569. accounts.sort_by(|a, b| a.pubkey().cmp(b.pubkey()));
  2570. let len0 = accounts.len();
  2571. if accounts.len() > 1 {
  2572. let mut last = 0;
  2573. let mut curr = 1;
  2574. while curr < accounts.len() {
  2575. if accounts[curr].pubkey() != accounts[last].pubkey() {
  2576. last += 1;
  2577. }
  2578. accounts[last] = accounts[curr];
  2579. curr += 1;
  2580. }
  2581. accounts.truncate(last + 1);
  2582. }
  2583. len0 - accounts.len()
  2584. }
  2585. pub(crate) fn get_unique_accounts_from_storage_for_shrink(
  2586. &self,
  2587. store: &AccountStorageEntry,
  2588. stats: &ShrinkStats,
  2589. ) -> GetUniqueAccountsResult {
  2590. let (result, storage_read_elapsed_us) =
  2591. measure_us!(self.get_unique_accounts_from_storage(store));
  2592. stats
  2593. .storage_read_elapsed
  2594. .fetch_add(storage_read_elapsed_us, Ordering::Relaxed);
  2595. stats
  2596. .num_duplicated_accounts
  2597. .fetch_add(result.num_duplicated_accounts as u64, Ordering::Relaxed);
  2598. result
  2599. }
/// shared code for shrinking normal slots and combining into ancient append vecs
/// note 'unique_accounts' is passed by ref so we can return references to data within it, avoiding self-references
pub(crate) fn shrink_collect<'a: 'b, 'b, T: ShrinkCollectRefs<'b>>(
    &self,
    store: &'a AccountStorageEntry,
    unique_accounts: &'b mut GetUniqueAccountsResult,
    stats: &ShrinkStats,
) -> ShrinkCollect<'b, T> {
    let slot = store.slot();
    let GetUniqueAccountsResult {
        stored_accounts,
        capacity,
        num_duplicated_accounts,
    } = unique_accounts;

    let mut index_read_elapsed = Measure::start("index_read_elapsed");

    // Get a set of all obsolete offsets
    // Slot is not needed, as all obsolete accounts can be considered
    // dead for shrink. Zero lamport accounts are not marked obsolete
    let obsolete_offsets: IntSet<_> = store
        .obsolete_accounts_read_lock()
        .filter_obsolete_accounts(None)
        .map(|(offset, _)| offset)
        .collect();

    // Filter all the accounts that are marked obsolete
    let total_starting_accounts = stored_accounts.len();
    stored_accounts.retain(|account| !obsolete_offsets.contains(&account.index_info.offset()));

    let len = stored_accounts.len();
    // Shared accumulator; worker chunks below merge their partial results
    // into it under the mutex.
    let shrink_collect = Mutex::new(ShrinkCollect {
        slot,
        capacity: *capacity,
        pubkeys_to_unref: Vec::with_capacity(len),
        zero_lamport_single_ref_pubkeys: Vec::new(),
        alive_accounts: T::with_capacity(len, slot),
        total_starting_accounts,
        all_are_zero_lamports: true,
        alive_total_bytes: 0, // will be updated after `alive_accounts` is populated
    });

    stats
        .accounts_loaded
        .fetch_add(len as u64, Ordering::Relaxed);
    stats
        .obsolete_accounts_filtered
        .fetch_add((total_starting_accounts - len) as u64, Ordering::Relaxed);
    stats
        .num_duplicated_accounts
        .fetch_add(*num_duplicated_accounts as u64, Ordering::Relaxed);
    // Classify accounts (alive / dead / zero-lamport single-ref) in parallel
    // chunks on the background thread pool.
    self.thread_pool_background.install(|| {
        stored_accounts
            .par_chunks(SHRINK_COLLECT_CHUNK_SIZE)
            .for_each(|stored_accounts| {
                let LoadAccountsIndexForShrink {
                    alive_accounts,
                    mut pubkeys_to_unref,
                    all_are_zero_lamports,
                    mut zero_lamport_single_ref_pubkeys,
                } = self.load_accounts_index_for_shrink(stored_accounts, stats, slot);

                // collect
                let mut shrink_collect = shrink_collect.lock().unwrap();
                shrink_collect.alive_accounts.collect(alive_accounts);
                shrink_collect
                    .pubkeys_to_unref
                    .append(&mut pubkeys_to_unref);
                shrink_collect
                    .zero_lamport_single_ref_pubkeys
                    .append(&mut zero_lamport_single_ref_pubkeys);
                if !all_are_zero_lamports {
                    shrink_collect.all_are_zero_lamports = false;
                }
            });
    });
    index_read_elapsed.stop();

    let mut shrink_collect = shrink_collect.into_inner().unwrap();
    let alive_total_bytes = shrink_collect.alive_accounts.alive_bytes();
    shrink_collect.alive_total_bytes = alive_total_bytes;

    stats
        .index_read_elapsed
        .fetch_add(index_read_elapsed.as_us(), Ordering::Relaxed);
    stats.accounts_removed.fetch_add(
        total_starting_accounts - shrink_collect.alive_accounts.len(),
        Ordering::Relaxed,
    );
    stats.bytes_removed.fetch_add(
        capacity.saturating_sub(alive_total_bytes as u64),
        Ordering::Relaxed,
    );
    stats
        .bytes_written
        .fetch_add(alive_total_bytes as u64, Ordering::Relaxed);

    shrink_collect
}
/// These accounts were found during shrink of `slot` to be slot_list=[slot] and ref_count == 1 and lamports = 0.
/// This means this slot contained the only account data for this pubkey and it is zero lamport.
/// Thus, we did NOT treat this as an alive account, so we did NOT copy the zero lamport account to the new
/// storage. So, the account will no longer be alive or exist at `slot`.
/// So, first, remove the ref count since this newly shrunk storage will no longer access it.
/// Second, remove `slot` from the index entry's slot list. If the slot list is now empty, then the
/// pubkey can be removed completely from the index.
/// In parallel with this code (which is running in the bg), the same pubkey could be revived and written to
/// as part of tx processing. In that case, the slot list will contain a slot in the write cache and the
/// index entry will NOT be deleted.
fn remove_zero_lamport_single_ref_accounts_after_shrink(
    &self,
    zero_lamport_single_ref_pubkeys: &[&Pubkey],
    slot: Slot,
    stats: &ShrinkStats,
    do_assert: bool,
) {
    stats.purged_zero_lamports.fetch_add(
        zero_lamport_single_ref_pubkeys.len() as u64,
        Ordering::Relaxed,
    );

    // we have to unref before we `purge_keys_exact`. Otherwise, we could race with the foreground with tx processing
    // reviving this index entry and then we'd unref the revived version, which is a refcount bug.
    self.accounts_index.scan(
        zero_lamport_single_ref_pubkeys.iter().cloned(),
        |_pubkey, _slots_refs| AccountsIndexScanResult::Unref,
        // `do_assert` selects whether a refcount that would drop below zero
        // panics (assert) or is merely logged.
        if do_assert {
            Some(AccountsIndexScanResult::UnrefAssert0)
        } else {
            Some(AccountsIndexScanResult::UnrefLog0)
        },
        ScanFilter::All,
    );

    zero_lamport_single_ref_pubkeys.iter().for_each(|k| {
        // Drop `slot` from each pubkey's slot list (removing the index entry
        // entirely if the slot list becomes empty). The result is ignored.
        _ = self.purge_keys_exact([(**k, slot)]);
    });
}
/// common code from shrink and combine_ancient_slots
/// get rid of all original store_ids in the slot
pub(crate) fn remove_old_stores_shrink<'a, T: ShrinkCollectRefs<'a>>(
    &self,
    shrink_collect: &ShrinkCollect<'a, T>,
    stats: &ShrinkStats,
    shrink_in_progress: Option<ShrinkInProgress>,
    shrink_can_be_active: bool,
) {
    let mut time = Measure::start("remove_old_stores_shrink");

    // handle the zero lamport alive accounts before calling clean
    // We have to update the index entries for these zero lamport pubkeys before we remove the storage in `mark_dirty_dead_stores`
    // that contained the accounts.
    self.remove_zero_lamport_single_ref_accounts_after_shrink(
        &shrink_collect.zero_lamport_single_ref_pubkeys,
        shrink_collect.slot,
        stats,
        false,
    );

    // Purge old, overwritten storage entries
    // This has the side effect of dropping `shrink_in_progress`, which removes the old storage completely. The
    // index has to be correct before we drop the old storage.
    let dead_storages = self.mark_dirty_dead_stores(
        shrink_collect.slot,
        // If all accounts are zero lamports, then we want to mark the entire OLD append vec as dirty.
        // otherwise, we'll call 'add_uncleaned_pubkeys_after_shrink' just on the unref'd keys below.
        shrink_collect.all_are_zero_lamports,
        shrink_in_progress,
        shrink_can_be_active,
    );
    let dead_storages_len = dead_storages.len();

    if !shrink_collect.all_are_zero_lamports {
        // Only the individually unref'd keys need to be revisited by clean.
        self.add_uncleaned_pubkeys_after_shrink(
            shrink_collect.slot,
            shrink_collect.pubkeys_to_unref.iter().cloned().cloned(),
        );
    }

    let (_, drop_storage_entries_elapsed) = measure_us!(drop(dead_storages));
    time.stop();

    self.stats
        .dropped_stores
        .fetch_add(dead_storages_len as u64, Ordering::Relaxed);
    stats
        .drop_storage_entries_elapsed
        .fetch_add(drop_storage_entries_elapsed, Ordering::Relaxed);
    stats
        .remove_old_stores_shrink_us
        .fetch_add(time.as_us(), Ordering::Relaxed);
}
/// Unrefs `pubkeys` (found dead during shrink of `slot`) in the accounts index.
/// If an entry will become a single-ref zero-lamport account after the unref,
/// it is reported via `zero_lamport_single_ref_found` so its storage can be
/// reconsidered for clean/shrink.
pub(crate) fn unref_shrunk_dead_accounts<'a>(
    &self,
    pubkeys: impl Iterator<Item = &'a Pubkey>,
    slot: Slot,
) {
    self.accounts_index.scan(
        pubkeys,
        |pubkey, slot_refs| {
            match slot_refs {
                Some((slot_list, ref_count)) => {
                    // Let's handle the special case - after unref, the result is a single ref zero lamport account.
                    if slot_list.len() == 1 && ref_count == 2 {
                        if let Some((slot_alive, acct_info)) = slot_list.first() {
                            if acct_info.is_zero_lamport() && !acct_info.is_cached() {
                                self.zero_lamport_single_ref_found(
                                    *slot_alive,
                                    acct_info.offset(),
                                );
                            }
                        }
                    }
                }
                None => {
                    // We also expect that the accounts index must contain an
                    // entry for `pubkey`. Log a warning for now. In future,
                    // we will panic when this happens.
                    warn!(
                        "pubkey {pubkey} in slot {slot} was NOT found in accounts index \
                         during shrink"
                    );
                    // NOTE(review): the "shink" typo in this metric name is preserved
                    // deliberately — dashboards/alerts may key on the existing name.
                    datapoint_warn!(
                        "accounts_db-shink_pubkey_missing_from_index",
                        ("store_slot", slot, i64),
                        ("pubkey", pubkey.to_string(), String),
                    );
                }
            }
            // Regardless of the branch above, decrement this pubkey's ref count.
            AccountsIndexScanResult::Unref
        },
        None,
        ScanFilter::All,
    );
}
/// This function handles the case when zero lamport single ref accounts are found during shrink.
pub(crate) fn zero_lamport_single_ref_found(&self, slot: Slot, offset: Offset) {
    // This function can be called when a zero lamport single ref account is
    // found during shrink. Therefore, we can't use the safe version of
    // `get_slot_storage_entry` because shrink_in_progress map may not be
    // empty. We have to use the unsafe version to avoid to assert failure.
    // However, there is a possibility that the storage entry that we get is
    // an old one, which is being shrunk away, because multiple slots can be
    // shrunk away in parallel by thread pool. If this happens, any zero
    // lamport single ref offset marked on the storage will be lost when the
    // storage is dropped. However, this is not a problem, because after the
    // storage being shrunk, the new storage will not have any zero lamport
    // single ref account anyway. Therefore, we don't need to worry about
    // marking zero lamport single ref offset on the new storage.
    if let Some(store) = self
        .storage
        .get_slot_storage_entry_shrinking_in_progress_ok(slot)
    {
        if store.insert_zero_lamport_single_ref_account_offset(offset) {
            // this wasn't previously marked as zero lamport single ref
            self.shrink_stats
                .num_zero_lamport_single_ref_accounts_found
                .fetch_add(1, Ordering::Relaxed);

            if store.num_zero_lamport_single_ref_accounts() == store.count() {
                // all accounts in this storage can be dead
                self.dirty_stores.entry(slot).or_insert(store);
                self.shrink_stats
                    .num_dead_slots_added_to_clean
                    .fetch_add(1, Ordering::Relaxed);
            } else if Self::is_shrinking_productive(&store)
                && self.is_candidate_for_shrink(&store)
            {
                // this store might be eligible for shrinking now
                let is_new = self.shrink_candidate_slots.lock().unwrap().insert(slot);
                if is_new {
                    self.shrink_stats
                        .num_slots_with_zero_lamport_accounts_added_to_shrink
                        .fetch_add(1, Ordering::Relaxed);
                }
            } else {
                // Neither fully dead nor shrinkable: just record that we marked
                // a zero-lamport dead account in a non-shrinkable store.
                self.shrink_stats
                    .marking_zero_dead_accounts_in_non_shrinkable_store
                    .fetch_add(1, Ordering::Relaxed);
            }
        }
    }
}
/// Shrinks `store` by rewriting the alive accounts to a new storage
fn shrink_storage(&self, store: Arc<AccountStorageEntry>) {
    let slot = store.slot();
    if self.accounts_cache.contains(slot) {
        // It is not correct to shrink a slot while it is in the write cache until flush is complete and the slot is removed from the write cache.
        // There can exist a window after a slot is made a root and before the write cache flushing for that slot begins and then completes.
        // There can also exist a window after a slot is being flushed from the write cache until the index is updated and the slot is removed from the write cache.
        // During the second window, once an append vec has been created for the slot, it could be possible to try to shrink that slot.
        // Shrink no-ops before this function if there is no store for the slot (notice this function requires 'store' to be passed).
        // So, if we enter this function but the slot is still in the write cache, reasonable behavior is to skip shrinking this slot.
        // Flush will ONLY write alive accounts to the append vec, which is what shrink does anyway.
        // Flush then adds the slot to 'uncleaned_roots', which causes clean to take a look at the slot.
        // Clean causes us to mark accounts as dead, which causes shrink to later take a look at the slot.
        // This could be an assert, but it could lead to intermittency in tests.
        // It is 'correct' to ignore calls to shrink when a slot is still in the write cache.
        return;
    }
    let mut unique_accounts =
        self.get_unique_accounts_from_storage_for_shrink(&store, &self.shrink_stats);
    debug!("do_shrink_slot_store: slot: {slot}");
    // Classify every stored account as alive/dead/zero-lamport-single-ref.
    let shrink_collect = self.shrink_collect::<AliveAccounts<'_>>(
        &store,
        &mut unique_accounts,
        &self.shrink_stats,
    );

    // This shouldn't happen if alive_bytes is accurate.
    // However, it is possible that the remaining alive bytes could be 0. In that case, the whole slot should be marked dead by clean.
    if Self::should_not_shrink(
        shrink_collect.alive_total_bytes as u64,
        shrink_collect.capacity,
    ) || shrink_collect.alive_total_bytes == 0
    {
        if shrink_collect.alive_total_bytes == 0 {
            // clean needs to take care of this dead slot
            self.dirty_stores.insert(slot, store.clone());
        }

        if !shrink_collect.all_are_zero_lamports {
            // if all are zero lamports, then we expect that we would like to mark the whole slot dead, but we cannot. That's clean's job.
            info!(
                "Unexpected shrink for slot {} alive {} capacity {}, likely caused by a bug \
                 for calculating alive bytes.",
                slot, shrink_collect.alive_total_bytes, shrink_collect.capacity
            );
        }

        self.shrink_stats
            .skipped_shrink
            .fetch_add(1, Ordering::Relaxed);
        return;
    }

    self.unref_shrunk_dead_accounts(shrink_collect.pubkeys_to_unref.iter().cloned(), slot);

    let total_accounts_after_shrink = shrink_collect.alive_accounts.len();
    debug!(
        "shrinking: slot: {}, accounts: ({} => {}) bytes: {} original: {}",
        slot,
        shrink_collect.total_starting_accounts,
        total_accounts_after_shrink,
        shrink_collect.alive_total_bytes,
        shrink_collect.capacity,
    );

    let mut stats_sub = ShrinkStatsSub::default();
    let mut rewrite_elapsed = Measure::start("rewrite_elapsed");
    let (shrink_in_progress, time_us) =
        measure_us!(self.get_store_for_shrink(slot, shrink_collect.alive_total_bytes as u64));
    stats_sub.create_and_insert_store_elapsed_us = Saturating(time_us);

    // here, we're writing back alive_accounts. That should be an atomic operation
    // without use of rather wide locks in this whole function, because we're
    // mutating rooted slots; There should be no writers to them.
    let accounts = [(slot, &shrink_collect.alive_accounts.alive_accounts()[..])];
    let storable_accounts = StorableAccountsBySlot::new(slot, &accounts, self);
    stats_sub.store_accounts_timing = self.store_accounts_frozen(
        storable_accounts,
        shrink_in_progress.new_storage(),
        UpdateIndexThreadSelection::PoolWithThreshold,
    );

    rewrite_elapsed.stop();
    stats_sub.rewrite_elapsed_us = Saturating(rewrite_elapsed.as_us());

    // `store_accounts_frozen()` above may have purged accounts from some
    // other storage entries (the ones that were just overwritten by this
    // new storage entry). This means some of those stores might have caused
    // this slot to be re-added to `self.shrink_candidate_slots`, so delete
    // those here
    self.shrink_candidate_slots.lock().unwrap().remove(&slot);

    self.remove_old_stores_shrink(
        &shrink_collect,
        &self.shrink_stats,
        Some(shrink_in_progress),
        false,
    );

    self.reopen_storage_as_readonly_shrinking_in_progress_ok(slot);

    Self::update_shrink_stats(&self.shrink_stats, stats_sub, true);
    self.shrink_stats.report();
}
  2958. pub(crate) fn update_shrink_stats(
  2959. shrink_stats: &ShrinkStats,
  2960. stats_sub: ShrinkStatsSub,
  2961. increment_count: bool,
  2962. ) {
  2963. if increment_count {
  2964. shrink_stats
  2965. .num_slots_shrunk
  2966. .fetch_add(1, Ordering::Relaxed);
  2967. }
  2968. shrink_stats.create_and_insert_store_elapsed.fetch_add(
  2969. stats_sub.create_and_insert_store_elapsed_us.0,
  2970. Ordering::Relaxed,
  2971. );
  2972. shrink_stats.store_accounts_elapsed.fetch_add(
  2973. stats_sub.store_accounts_timing.store_accounts_elapsed,
  2974. Ordering::Relaxed,
  2975. );
  2976. shrink_stats.update_index_elapsed.fetch_add(
  2977. stats_sub.store_accounts_timing.update_index_elapsed,
  2978. Ordering::Relaxed,
  2979. );
  2980. shrink_stats.handle_reclaims_elapsed.fetch_add(
  2981. stats_sub.store_accounts_timing.handle_reclaims_elapsed,
  2982. Ordering::Relaxed,
  2983. );
  2984. shrink_stats
  2985. .rewrite_elapsed
  2986. .fetch_add(stats_sub.rewrite_elapsed_us.0, Ordering::Relaxed);
  2987. shrink_stats
  2988. .unpackable_slots_count
  2989. .fetch_add(stats_sub.unpackable_slots_count.0 as u64, Ordering::Relaxed);
  2990. shrink_stats.newest_alive_packed_count.fetch_add(
  2991. stats_sub.newest_alive_packed_count.0 as u64,
  2992. Ordering::Relaxed,
  2993. );
  2994. }
  2995. /// get stores for 'slot'
  2996. /// Drop 'shrink_in_progress', which will cause the old store to be removed from the storage map.
  2997. /// For 'shrink_in_progress'.'old_storage' which is not retained, insert in 'dead_storages' and optionally 'dirty_stores'
  2998. /// This is the end of the life cycle of `shrink_in_progress`.
  2999. pub fn mark_dirty_dead_stores(
  3000. &self,
  3001. slot: Slot,
  3002. add_dirty_stores: bool,
  3003. shrink_in_progress: Option<ShrinkInProgress>,
  3004. shrink_can_be_active: bool,
  3005. ) -> Vec<Arc<AccountStorageEntry>> {
  3006. let mut dead_storages = Vec::default();
  3007. let mut not_retaining_store = |store: &Arc<AccountStorageEntry>| {
  3008. if add_dirty_stores {
  3009. self.dirty_stores.insert(slot, store.clone());
  3010. }
  3011. dead_storages.push(store.clone());
  3012. };
  3013. if let Some(shrink_in_progress) = shrink_in_progress {
  3014. // shrink is in progress, so 1 new append vec to keep, 1 old one to throw away
  3015. not_retaining_store(shrink_in_progress.old_storage());
  3016. // dropping 'shrink_in_progress' removes the old append vec that was being shrunk from db's storage
  3017. } else if let Some(store) = self.storage.remove(&slot, shrink_can_be_active) {
  3018. // no shrink in progress, so all append vecs in this slot are dead
  3019. not_retaining_store(&store);
  3020. }
  3021. dead_storages
  3022. }
  3023. /// we are done writing to the storage at `slot`. It can be re-opened as read-only if that would help
  3024. /// system performance.
  3025. pub(crate) fn reopen_storage_as_readonly_shrinking_in_progress_ok(&self, slot: Slot) {
  3026. if let Some(storage) = self
  3027. .storage
  3028. .get_slot_storage_entry_shrinking_in_progress_ok(slot)
  3029. {
  3030. if let Some(new_storage) = storage.reopen_as_readonly(self.storage_access) {
  3031. // consider here the race condition of tx processing having looked up something in the index,
  3032. // which could return (slot, append vec id). We want the lookup for the storage to get a storage
  3033. // that works whether the lookup occurs before or after the replace call here.
  3034. // So, the two storages have to be exactly equivalent wrt offsets, counts, len, id, etc.
  3035. assert_eq!(storage.id(), new_storage.id());
  3036. assert_eq!(storage.accounts.len(), new_storage.accounts.len());
  3037. self.storage
  3038. .replace_storage_with_equivalent(slot, Arc::new(new_storage));
  3039. }
  3040. }
  3041. }
  3042. /// return a store that can contain 'size' bytes
  3043. pub fn get_store_for_shrink(&self, slot: Slot, size: u64) -> ShrinkInProgress<'_> {
  3044. let shrunken_store = self.create_store(slot, size, "shrink", self.shrink_paths.as_slice());
  3045. self.storage.shrinking_in_progress(slot, shrunken_store)
  3046. }
  3047. // Reads all accounts in given slot's AppendVecs and filter only to alive,
  3048. // then create a minimum AppendVec filled with the alive.
  3049. fn shrink_slot_forced(&self, slot: Slot) {
  3050. debug!("shrink_slot_forced: slot: {slot}");
  3051. if let Some(store) = self
  3052. .storage
  3053. .get_slot_storage_entry_shrinking_in_progress_ok(slot)
  3054. {
  3055. if Self::is_shrinking_productive(&store) {
  3056. self.shrink_storage(store)
  3057. }
  3058. }
  3059. }
/// Returns every slot that currently has a storage entry in the storage map.
fn all_slots_in_storage(&self) -> Vec<Slot> {
    self.storage.all_slots()
}
/// Given the input `ShrinkCandidates`, this function sorts the stores by their alive ratio
/// in increasing order with the most sparse entries in the front. It will then simulate the
/// shrinking by working on the most sparse entries first and if the overall alive ratio is
/// achieved, it will stop and return:
/// first tuple element: the filtered-down candidates and
/// second tuple element: the candidates which
/// are skipped in this round and might be eligible for the future shrink.
fn select_candidates_by_total_usage(
    &self,
    shrink_slots: &ShrinkCandidates,
    shrink_ratio: f64,
) -> (IntMap<Slot, Arc<AccountStorageEntry>>, ShrinkCandidates) {
    /// Per-candidate usage snapshot used for sorting below.
    struct StoreUsageInfo {
        slot: Slot,
        alive_ratio: f64,
        store: Arc<AccountStorageEntry>,
    }
    let mut store_usage: Vec<StoreUsageInfo> = Vec::with_capacity(shrink_slots.len());
    // Running totals over all candidates; used to compute the *overall* alive ratio in the
    // selection loop below.
    let mut total_alive_bytes: u64 = 0;
    let mut total_bytes: u64 = 0;
    for slot in shrink_slots {
        // A candidate may have lost its storage since it was enqueued; skip it.
        let Some(store) = self.storage.get_slot_storage_entry(*slot) else {
            continue;
        };
        let alive_bytes = store.alive_bytes();
        total_alive_bytes += alive_bytes as u64;
        total_bytes += store.capacity();
        let alive_ratio = alive_bytes as f64 / store.capacity() as f64;
        store_usage.push(StoreUsageInfo {
            slot: *slot,
            alive_ratio,
            store: store.clone(),
        });
    }
    // Sort most sparse (lowest alive ratio) first. f64 has no total order, so incomparable
    // (NaN) pairs are treated as equal rather than panicking.
    store_usage.sort_by(|a, b| {
        a.alive_ratio
            .partial_cmp(&b.alive_ratio)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Working from the beginning of store_usage which are the most sparse and see when we can stop
    // shrinking while still achieving the overall goals.
    let mut shrink_slots = IntMap::default();
    let mut shrink_slots_next_batch = ShrinkCandidates::default();
    for usage in &store_usage {
        let store = &usage.store;
        // Overall alive ratio as it would be *after* shrinking the stores selected so far
        // (total_bytes is reduced in the else-branch below as stores are selected).
        let alive_ratio = (total_alive_bytes as f64) / (total_bytes as f64);
        debug!(
            "alive_ratio: {:?} store_id: {:?}, store_ratio: {:?} requirement: {:?}, \
             total_bytes: {:?} total_alive_bytes: {:?}",
            alive_ratio,
            usage.store.id(),
            usage.alive_ratio,
            shrink_ratio,
            total_bytes,
            total_alive_bytes
        );
        if alive_ratio > shrink_ratio {
            // we have reached our goal, stop
            debug!(
                "Shrinking goal can be achieved at slot {:?}, total_alive_bytes: {:?} \
                 total_bytes: {:?}, alive_ratio: {:}, shrink_ratio: {:?}",
                usage.slot, total_alive_bytes, total_bytes, alive_ratio, shrink_ratio
            );
            if usage.alive_ratio < shrink_ratio {
                // This store is individually sparse enough to be worth shrinking later,
                // just not needed to hit the overall goal this round: defer it.
                shrink_slots_next_batch.insert(usage.slot);
            } else {
                break;
            }
        } else {
            // Select this store and simulate the shrink by removing the bytes it would
            // reclaim from the running total.
            let current_store_size = store.capacity();
            let after_shrink_size = store.alive_bytes() as u64;
            let bytes_saved = current_store_size.saturating_sub(after_shrink_size);
            total_bytes -= bytes_saved;
            shrink_slots.insert(usage.slot, Arc::clone(store));
        }
    }
    (shrink_slots, shrink_slots_next_batch)
}
  3141. fn get_roots_less_than(&self, slot: Slot) -> Vec<Slot> {
  3142. self.accounts_index
  3143. .roots_tracker
  3144. .read()
  3145. .unwrap()
  3146. .alive_roots
  3147. .get_all_less_than(slot)
  3148. }
/// return all slots that are more than one epoch old and thus could already be an ancient append vec
/// or which could need to be combined into a new or existing ancient append vec
///
/// `oldest_non_ancient_slot` is the exclusive upper bound on returned slots.
/// NOTE(review): the previous comment referenced an `offset` parameter this fn does not
/// take — presumably `ancient_append_vec_offset` influences `oldest_non_ancient_slot` at
/// the caller to include newer slots for testing; confirm against the caller.
fn get_sorted_potential_ancient_slots(&self, oldest_non_ancient_slot: Slot) -> Vec<Slot> {
    let mut ancient_slots = self.get_roots_less_than(oldest_non_ancient_slot);
    ancient_slots.sort_unstable();
    ancient_slots
}
  3157. /// get a sorted list of slots older than an epoch
  3158. /// squash those slots into ancient append vecs
  3159. pub fn shrink_ancient_slots(&self, epoch_schedule: &EpochSchedule) {
  3160. if self.ancient_append_vec_offset.is_none() {
  3161. return;
  3162. }
  3163. let oldest_non_ancient_slot = self.get_oldest_non_ancient_slot(epoch_schedule);
  3164. let can_randomly_shrink = true;
  3165. let sorted_slots = self.get_sorted_potential_ancient_slots(oldest_non_ancient_slot);
  3166. self.combine_ancient_slots_packed(sorted_slots, can_randomly_shrink);
  3167. }
  3168. /// each slot in 'dropped_roots' has been combined into an ancient append vec.
  3169. /// We are done with the slot now forever.
  3170. pub(crate) fn handle_dropped_roots_for_ancient(
  3171. &self,
  3172. dropped_roots: impl Iterator<Item = Slot>,
  3173. ) {
  3174. dropped_roots.for_each(|slot| {
  3175. self.accounts_index.clean_dead_slot(slot);
  3176. // the storage has been removed from this slot and recycled or dropped
  3177. assert!(self.storage.remove(&slot, false).is_none());
  3178. debug_assert!(
  3179. !self
  3180. .accounts_index
  3181. .roots_tracker
  3182. .read()
  3183. .unwrap()
  3184. .alive_roots
  3185. .contains(&slot),
  3186. "slot: {slot}"
  3187. );
  3188. });
  3189. }
  3190. /// add all 'pubkeys' into the set of pubkeys that are 'uncleaned', associated with 'slot'
  3191. /// clean will visit these pubkeys next time it runs
  3192. fn add_uncleaned_pubkeys_after_shrink(
  3193. &self,
  3194. slot: Slot,
  3195. pubkeys: impl Iterator<Item = Pubkey>,
  3196. ) {
  3197. /*
  3198. This is only called during 'shrink'-type operations.
  3199. Original accounts were separated into 'accounts' and 'pubkeys_to_unref'.
  3200. These sets correspond to 'alive' and 'dead'.
  3201. 'alive' means this account in this slot is in the accounts index.
  3202. 'dead' means this account in this slot is NOT in the accounts index.
  3203. If dead, nobody will care if this version of this account is not written into the newly shrunk append vec for this slot.
  3204. For all dead accounts, they were already unrefed and are now absent in the new append vec.
  3205. This means that another version of this pubkey could possibly now be cleaned since this one is now gone.
  3206. For example, a zero lamport account in a later slot can be removed if we just removed the only non-zero lamport account for that pubkey in this slot.
  3207. So, for all unrefed accounts, send them to clean to be revisited next time clean runs.
  3208. If an account is alive, then its status has not changed. It was previously alive in this slot. It is still alive in this slot.
  3209. Clean doesn't care about alive accounts that remain alive.
  3210. Except... A slightly different case is if ALL the alive accounts in this slot are zero lamport accounts, then it is possible that
  3211. this slot can be marked dead. So, if all alive accounts are zero lamports, we send the entire OLD/pre-shrunk append vec
  3212. to clean so that all the pubkeys are visited.
  3213. It is a performance optimization to not send the ENTIRE old/pre-shrunk append vec to clean in the normal case.
  3214. */
  3215. let mut uncleaned_pubkeys = self.uncleaned_pubkeys.entry(slot).or_default();
  3216. uncleaned_pubkeys.extend(pubkeys);
  3217. }
  3218. pub fn shrink_candidate_slots(&self, epoch_schedule: &EpochSchedule) -> usize {
  3219. let oldest_non_ancient_slot = self.get_oldest_non_ancient_slot(epoch_schedule);
  3220. let shrink_candidates_slots =
  3221. std::mem::take(&mut *self.shrink_candidate_slots.lock().unwrap());
  3222. self.shrink_stats
  3223. .initial_candidates_count
  3224. .store(shrink_candidates_slots.len() as u64, Ordering::Relaxed);
  3225. let candidates_count = shrink_candidates_slots.len();
  3226. let ((mut shrink_slots, shrink_slots_next_batch), select_time_us) = measure_us!({
  3227. if let AccountShrinkThreshold::TotalSpace { shrink_ratio } = self.shrink_ratio {
  3228. let (shrink_slots, shrink_slots_next_batch) =
  3229. self.select_candidates_by_total_usage(&shrink_candidates_slots, shrink_ratio);
  3230. (shrink_slots, Some(shrink_slots_next_batch))
  3231. } else {
  3232. (
  3233. // lookup storage for each slot
  3234. shrink_candidates_slots
  3235. .into_iter()
  3236. .filter_map(|slot| {
  3237. self.storage
  3238. .get_slot_storage_entry(slot)
  3239. .map(|storage| (slot, storage))
  3240. })
  3241. .collect(),
  3242. None,
  3243. )
  3244. }
  3245. });
  3246. // If there are too few slots to shrink, add an ancient slot
  3247. // for shrinking.
  3248. if shrink_slots.len() < SHRINK_INSERT_ANCIENT_THRESHOLD {
  3249. let mut ancients = self.best_ancient_slots_to_shrink.write().unwrap();
  3250. while let Some((slot, capacity)) = ancients.pop_front() {
  3251. if let Some(store) = self.storage.get_slot_storage_entry(slot) {
  3252. if !shrink_slots.contains(&slot)
  3253. && capacity == store.capacity()
  3254. && Self::is_candidate_for_shrink(self, &store)
  3255. {
  3256. let ancient_bytes_added_to_shrink = store.alive_bytes() as u64;
  3257. shrink_slots.insert(slot, store);
  3258. self.shrink_stats
  3259. .ancient_bytes_added_to_shrink
  3260. .fetch_add(ancient_bytes_added_to_shrink, Ordering::Relaxed);
  3261. self.shrink_stats
  3262. .ancient_slots_added_to_shrink
  3263. .fetch_add(1, Ordering::Relaxed);
  3264. break;
  3265. }
  3266. }
  3267. }
  3268. }
  3269. if shrink_slots.is_empty()
  3270. && shrink_slots_next_batch
  3271. .as_ref()
  3272. .map(|s| s.is_empty())
  3273. .unwrap_or(true)
  3274. {
  3275. return 0;
  3276. }
  3277. let _guard = (!shrink_slots.is_empty())
  3278. .then_some(|| self.active_stats.activate(ActiveStatItem::Shrink));
  3279. let num_selected = shrink_slots.len();
  3280. let (_, shrink_all_us) = measure_us!({
  3281. self.thread_pool_background.install(|| {
  3282. shrink_slots
  3283. .into_par_iter()
  3284. .for_each(|(slot, slot_shrink_candidate)| {
  3285. if self.ancient_append_vec_offset.is_some()
  3286. && slot < oldest_non_ancient_slot
  3287. {
  3288. self.shrink_stats
  3289. .num_ancient_slots_shrunk
  3290. .fetch_add(1, Ordering::Relaxed);
  3291. }
  3292. self.shrink_storage(slot_shrink_candidate);
  3293. });
  3294. })
  3295. });
  3296. let mut pended_counts: usize = 0;
  3297. if let Some(shrink_slots_next_batch) = shrink_slots_next_batch {
  3298. let mut shrink_slots = self.shrink_candidate_slots.lock().unwrap();
  3299. pended_counts = shrink_slots_next_batch.len();
  3300. for slot in shrink_slots_next_batch {
  3301. shrink_slots.insert(slot);
  3302. }
  3303. }
  3304. datapoint_info!(
  3305. "shrink_candidate_slots",
  3306. ("select_time_us", select_time_us, i64),
  3307. ("shrink_all_us", shrink_all_us, i64),
  3308. ("candidates_count", candidates_count, i64),
  3309. ("selected_count", num_selected, i64),
  3310. ("deferred_to_next_round_count", pended_counts, i64)
  3311. );
  3312. num_selected
  3313. }
/// This is only called at startup from bank when we are being extra careful such as when we downloaded a snapshot.
/// Also called from tests.
/// `newest_slot_skip_shrink_inclusive` is used to avoid shrinking the slot we are loading a snapshot from. If we shrink that slot, we affect
/// the bank hash calculation verification at startup.
pub fn shrink_all_slots(
    &self,
    is_startup: bool,
    epoch_schedule: &EpochSchedule,
    newest_slot_skip_shrink_inclusive: Option<Slot>,
) {
    let _guard = self.active_stats.activate(ActiveStatItem::Shrink);
    // Once this many dirty stores accumulate, interleave a clean pass between shrinks.
    const DIRTY_STORES_CLEANING_THRESHOLD: usize = 10_000;
    // Number of slots processed (in parallel at startup) between clean checks.
    const OUTER_CHUNK_SIZE: usize = 2000;
    let mut slots = self.all_slots_in_storage();
    if let Some(newest_slot_skip_shrink_inclusive) = newest_slot_skip_shrink_inclusive {
        // at startup, we cannot shrink the slot that we're about to replay and recalculate bank hash for.
        // That storage's contents are used to verify the bank hash (and accounts delta hash) of the startup slot.
        slots.retain(|slot| slot < &newest_slot_skip_shrink_inclusive);
    }
    // if we are restoring from incremental + full snapshot, then we cannot clean past latest_full_snapshot_slot.
    // If we were to clean past that, then we could mark accounts prior to latest_full_snapshot_slot as dead.
    // If we mark accounts prior to latest_full_snapshot_slot as dead, then we could shrink those accounts away.
    // If we shrink accounts away, then when we run the full hash of all accounts calculation up to latest_full_snapshot_slot,
    // then we will get the wrong answer, because some accounts may be GONE from the slot range up to latest_full_snapshot_slot.
    // So, we can only clean UP TO and including latest_full_snapshot_slot.
    // As long as we don't mark anything as dead at slots > latest_full_snapshot_slot, then shrink will have nothing to do for
    // slots > latest_full_snapshot_slot.
    let maybe_clean = || {
        if self.dirty_stores.len() > DIRTY_STORES_CLEANING_THRESHOLD {
            let latest_full_snapshot_slot = self.latest_full_snapshot_slot();
            self.clean_accounts(latest_full_snapshot_slot, is_startup, epoch_schedule);
        }
    };
    if is_startup {
        // At startup, shrink in parallel: split each outer chunk across the available
        // threads, then consider cleaning before moving on to the next chunk.
        let threads = num_cpus::get();
        let inner_chunk_size = std::cmp::max(OUTER_CHUNK_SIZE / threads, 1);
        slots.chunks(OUTER_CHUNK_SIZE).for_each(|chunk| {
            chunk.par_chunks(inner_chunk_size).for_each(|slots| {
                for slot in slots {
                    self.shrink_slot_forced(*slot);
                }
            });
            maybe_clean();
        });
    } else {
        // Steady state: shrink serially, considering a clean after every slot.
        for slot in slots {
            self.shrink_slot_forced(slot);
            maybe_clean();
        }
    }
}
/// Scans accounts visible from `ancestors`, calling `scan_func` once per index entry with
/// `Some((pubkey, account, slot))`, or `None` when the account could not be loaded (e.g.
/// its cache entry was flushed between the index read and the cache read).
///
/// Errors if the scan is aborted (see note below).
pub fn scan_accounts<F>(
    &self,
    ancestors: &Ancestors,
    bank_id: BankId,
    mut scan_func: F,
    config: &ScanConfig,
) -> ScanResult<()>
where
    F: FnMut(Option<(&Pubkey, AccountSharedData, Slot)>),
{
    // This can error out if the slots being scanned over are aborted
    self.accounts_index.scan_accounts(
        ancestors,
        bank_id,
        |pubkey, (account_info, slot)| {
            let mut account_accessor =
                self.get_account_accessor(slot, pubkey, &account_info.storage_location());
            let account_slot = match account_accessor {
                // The cached entry disappeared after the index lookup; report None
                // rather than retrying.
                LoadedAccountAccessor::Cached(None) => None,
                _ => account_accessor.get_loaded_account(|loaded_account| {
                    (pubkey, loaded_account.take_account(), slot)
                }),
            };
            scan_func(account_slot)
        },
        config,
    )?;
    Ok(())
}
  3394. pub fn index_scan_accounts<F>(
  3395. &self,
  3396. ancestors: &Ancestors,
  3397. bank_id: BankId,
  3398. index_key: IndexKey,
  3399. mut scan_func: F,
  3400. config: &ScanConfig,
  3401. ) -> ScanResult<bool>
  3402. where
  3403. F: FnMut(Option<(&Pubkey, AccountSharedData, Slot)>),
  3404. {
  3405. let key = match &index_key {
  3406. IndexKey::ProgramId(key) => key,
  3407. IndexKey::SplTokenMint(key) => key,
  3408. IndexKey::SplTokenOwner(key) => key,
  3409. };
  3410. if !self.account_indexes.include_key(key) {
  3411. // the requested key was not indexed in the secondary index, so do a normal scan
  3412. let used_index = false;
  3413. self.scan_accounts(ancestors, bank_id, scan_func, config)?;
  3414. return Ok(used_index);
  3415. }
  3416. self.accounts_index.index_scan_accounts(
  3417. ancestors,
  3418. bank_id,
  3419. index_key,
  3420. |pubkey, (account_info, slot)| {
  3421. let account_slot = self
  3422. .get_account_accessor(slot, pubkey, &account_info.storage_location())
  3423. .get_loaded_account(|loaded_account| {
  3424. (pubkey, loaded_account.take_account(), slot)
  3425. });
  3426. scan_func(account_slot)
  3427. },
  3428. config,
  3429. )?;
  3430. let used_index = true;
  3431. Ok(used_index)
  3432. }
/// Scan a specific slot through all the account storage
///
/// `cache_map_func` is applied per account when the slot is still in the accounts cache;
/// `storage_scan_func` is applied per stored account when the slot has been flushed to
/// storage. `scan_account_storage_data` selects whether the account data slice is loaded
/// and passed to `storage_scan_func` (`Some(data)`) or omitted (`None`).
pub(crate) fn scan_account_storage<R, B>(
    &self,
    slot: Slot,
    cache_map_func: impl Fn(&LoadedAccount) -> Option<R> + Sync,
    storage_scan_func: impl for<'a, 'b, 'storage> Fn(
        &'b mut B,
        &'a StoredAccountInfoWithoutData<'storage>,
        Option<&'storage [u8]>, // account data
    ) + Sync,
    scan_account_storage_data: ScanAccountStorageData,
) -> ScanStorageResult<R, B>
where
    R: Send,
    B: Send + Default + Sync,
{
    self.scan_cache_storage_fallback(slot, cache_map_func, |retval, storage| {
        match scan_account_storage_data {
            ScanAccountStorageData::NoData => {
                // cheaper scan: account metadata only, no data bytes
                storage.scan_accounts_without_data(|_offset, account_without_data| {
                    storage_scan_func(retval, &account_without_data, None);
                })
            }
            ScanAccountStorageData::DataRefForStorage => {
                // full scan: also hand the caller a reference to each account's data
                let mut reader = append_vec::new_scan_accounts_reader();
                storage.scan_accounts(&mut reader, |_offset, account| {
                    let account_without_data = StoredAccountInfoWithoutData::new_from(&account);
                    storage_scan_func(retval, &account_without_data, Some(account.data));
                })
            }
        }
        .expect("must scan accounts storage");
    })
}
/// Scan the cache with a fallback to storage for a specific slot.
///
/// If `slot` is present in the accounts cache, applies `cache_map_func` to each cached
/// account (in parallel above `SCAN_SLOT_PAR_ITER_THRESHOLD` entries) and returns
/// `ScanStorageResult::Cached`. Otherwise applies `storage_fallback_func` to the slot's
/// storage (if any) and returns `ScanStorageResult::Stored`.
pub fn scan_cache_storage_fallback<R, B>(
    &self,
    slot: Slot,
    cache_map_func: impl Fn(&LoadedAccount) -> Option<R> + Sync,
    storage_fallback_func: impl Fn(&mut B, &AccountsFile) + Sync,
) -> ScanStorageResult<R, B>
where
    R: Send,
    B: Send + Default + Sync,
{
    if let Some(slot_cache) = self.accounts_cache.slot_cache(slot) {
        // If we see the slot in the cache, then all the account information
        // is in this cached slot
        if slot_cache.len() > SCAN_SLOT_PAR_ITER_THRESHOLD {
            // big slot: scan the cache in parallel on the foreground pool
            ScanStorageResult::Cached(self.thread_pool_foreground.install(|| {
                slot_cache
                    .par_iter()
                    .filter_map(|cached_account| {
                        cache_map_func(&LoadedAccount::Cached(Cow::Borrowed(
                            cached_account.value(),
                        )))
                    })
                    .collect()
            }))
        } else {
            // small slot: a serial scan avoids the parallelism overhead
            ScanStorageResult::Cached(
                slot_cache
                    .iter()
                    .filter_map(|cached_account| {
                        cache_map_func(&LoadedAccount::Cached(Cow::Borrowed(
                            cached_account.value(),
                        )))
                    })
                    .collect(),
            )
        }
    } else {
        let mut retval = B::default();
        // If the slot is not in the cache, then all the account information must have
        // been flushed. This is guaranteed because we only remove the rooted slot from
        // the cache *after* we've finished flushing in `flush_slot_cache`.
        // Regarding `shrinking_in_progress_ok`:
        // This fn could be running in the foreground, so shrinking could be running in the background, independently.
        // Even if shrinking is running, there will be 0-1 active storages to scan here at any point.
        // When a concurrent shrink completes, the active storage at this slot will
        // be replaced with an equivalent storage with only alive accounts in it.
        // A shrink on this slot could have completed anytime before the call here, a shrink could currently be in progress,
        // or the shrink could complete immediately or anytime after this call. This has always been true.
        // So, whether we get a never-shrunk, an about-to-be shrunk, or a will-be-shrunk-in-future storage here to scan,
        // all are correct and possible in a normally running system.
        if let Some(storage) = self
            .storage
            .get_slot_storage_entry_shrinking_in_progress_ok(slot)
        {
            storage_fallback_func(&mut retval, &storage.accounts);
        }
        ScanStorageResult::Stored(retval)
    }
}
/// Loads the newest version of the account for `pubkey` visible from `ancestors`.
/// Returns `None` for accounts with zero lamports (via `LoadZeroLamports::None`).
pub fn load(
    &self,
    ancestors: &Ancestors,
    pubkey: &Pubkey,
    load_hint: LoadHint,
) -> Option<(AccountSharedData, Slot)> {
    self.do_load(ancestors, pubkey, None, load_hint, LoadZeroLamports::None)
}
/// load the account with `pubkey` into the read only accounts cache.
/// The goal is to make subsequent loads (which caller expects to occur) to find the account quickly.
pub fn load_account_into_read_cache(&self, ancestors: &Ancestors, pubkey: &Pubkey) {
    self.do_load_with_populate_read_cache(
        ancestors,
        pubkey,
        None,
        LoadHint::Unspecified,
        true, // populate the read cache
        // no return from this function, so irrelevant
        LoadZeroLamports::None,
    );
}
/// note this returns None for accounts with zero lamports
///
/// Same as [`load`] with `LoadHint::FixedMaxRoot` (the max root will not move during the
/// load, e.g. during replay/banking).
pub fn load_with_fixed_root(
    &self,
    ancestors: &Ancestors,
    pubkey: &Pubkey,
) -> Option<(AccountSharedData, Slot)> {
    self.load(ancestors, pubkey, LoadHint::FixedMaxRoot)
}
/// Looks up `pubkey` in the accounts index (restricted by `ancestors` and `max_root`) and
/// returns the winning `(slot, storage_location)`. When `clone_in_lock` is true, also
/// constructs the account accessor while still inside the index callback, so the returned
/// location and accessor are consistent with each other.
fn read_index_for_accessor_or_load_slow<'a>(
    &'a self,
    ancestors: &Ancestors,
    pubkey: &'a Pubkey,
    max_root: Option<Slot>,
    clone_in_lock: bool,
) -> Option<(Slot, StorageLocation, Option<LoadedAccountAccessor<'a>>)> {
    self.accounts_index.get_with_and_then(
        pubkey,
        Some(ancestors),
        max_root,
        true,
        |(slot, account_info)| {
            let storage_location = account_info.storage_location();
            // Only build the accessor when requested (the "slow"/fallback path).
            let account_accessor = clone_in_lock
                .then(|| self.get_account_accessor(slot, pubkey, &storage_location));
            (slot, storage_location, account_accessor)
        },
    )
}
  3576. fn retry_to_get_account_accessor<'a>(
  3577. &'a self,
  3578. mut slot: Slot,
  3579. mut storage_location: StorageLocation,
  3580. ancestors: &'a Ancestors,
  3581. pubkey: &'a Pubkey,
  3582. max_root: Option<Slot>,
  3583. load_hint: LoadHint,
  3584. ) -> Option<(LoadedAccountAccessor<'a>, Slot)> {
  3585. // Happy drawing time! :)
  3586. //
  3587. // Reader | Accessed data source for cached/stored
  3588. // -------------------------------------+----------------------------------
  3589. // R1 read_index_for_accessor_or_load_slow()| cached/stored: index
  3590. // | |
  3591. // <(store_id, offset, ..)> |
  3592. // V |
  3593. // R2 retry_to_get_account_accessor()/ | cached: map of caches & entry for (slot, pubkey)
  3594. // get_account_accessor() | stored: map of stores
  3595. // | |
  3596. // <Accessor> |
  3597. // V |
  3598. // R3 check_and_get_loaded_account()/ | cached: N/A (note: basically noop unwrap)
  3599. // get_loaded_account() | stored: store's entry for slot
  3600. // | |
  3601. // <LoadedAccount> |
  3602. // V |
  3603. // R4 take_account() | cached/stored: entry of cache/storage for (slot, pubkey)
  3604. // | |
  3605. // <AccountSharedData> |
  3606. // V |
  3607. // Account!! V
  3608. //
  3609. // Flusher | Accessed data source for cached/stored
  3610. // -------------------------------------+----------------------------------
  3611. // F1 flush_slot_cache() | N/A
  3612. // | |
  3613. // V |
  3614. // F2 store_accounts_frozen()/ | map of stores (creates new entry)
  3615. // write_accounts_to_storage() |
  3616. // | |
  3617. // V |
  3618. // F3 store_accounts_frozen()/ | index
  3619. // update_index() | (replaces existing store_id, offset in caches)
  3620. // | |
  3621. // V |
  3622. // F4 accounts_cache.remove_slot() | map of caches (removes old entry)
  3623. // V
  3624. //
  3625. // Remarks for flusher: So, for any reading operations, it's a race condition where F4 happens
  3626. // between R1 and R2. In that case, retrying from R1 is safu because F3 should have
  3627. // been occurred.
  3628. //
  3629. // Shrinker | Accessed data source for stored
  3630. // -------------------------------------+----------------------------------
  3631. // S1 do_shrink_slot_store() | N/A
  3632. // | |
  3633. // V |
  3634. // S2 store_accounts_frozen()/ | map of stores (creates new entry)
  3635. // write_accounts_to_storage() |
  3636. // | |
  3637. // V |
  3638. // S3 store_accounts_frozen()/ | index
  3639. // update_index() | (replaces existing store_id, offset in stores)
  3640. // | |
  3641. // V |
  3642. // S4 do_shrink_slot_store()/ | map of stores (removes old entry)
  3643. // dead_storages
  3644. //
  3645. // Remarks for shrinker: So, for any reading operations, it's a race condition
  3646. // where S4 happens between R1 and R2. In that case, retrying from R1 is safu because S3 should have
  3647. // been occurred, and S3 atomically replaced the index accordingly.
  3648. //
  3649. // Cleaner | Accessed data source for stored
  3650. // -------------------------------------+----------------------------------
  3651. // C1 clean_accounts() | N/A
  3652. // | |
  3653. // V |
  3654. // C2 clean_accounts()/ | index
  3655. // purge_keys_exact() | (removes existing store_id, offset for stores)
  3656. // | |
  3657. // V |
  3658. // C3 clean_accounts()/ | map of stores (removes old entry)
  3659. // handle_reclaims() |
  3660. //
  3661. // Remarks for cleaner: So, for any reading operations, it's a race condition
  3662. // where C3 happens between R1 and R2. In that case, retrying from R1 is safu.
  3663. // In that case, None would be returned while bailing out at R1.
  3664. //
  3665. // Purger | Accessed data source for cached/stored
  3666. // ---------------------------------------+----------------------------------
  3667. // P1 purge_slot() | N/A
  3668. // | |
  3669. // V |
  3670. // P2 purge_slots_from_cache_and_store() | map of caches/stores (removes old entry)
  3671. // | |
  3672. // V |
  3673. // P3 purge_slots_from_cache_and_store()/ | index
  3674. // purge_slot_cache()/ |
  3675. // purge_slot_cache_pubkeys() | (removes existing store_id, offset for cache)
  3676. // purge_slot_storage()/ |
  3677. // purge_keys_exact() | (removes accounts index entries)
  3678. // handle_reclaims() | (removes storage entries)
  3679. // OR |
  3680. // clean_accounts()/ |
  3681. // clean_accounts_older_than_root()| (removes existing store_id, offset for stores)
  3682. // V
  3683. //
  3684. // Remarks for purger: So, for any reading operations, it's a race condition
  3685. // where P2 happens between R1 and R2. In that case, retrying from R1 is safu.
  3686. // In that case, we may bail at index read retry when P3 hasn't been run
  3687. #[cfg(test)]
  3688. {
  3689. // Give some time for cache flushing to occur here for unit tests
  3690. sleep(Duration::from_millis(self.load_delay));
  3691. }
  3692. // Failsafe for potential race conditions with other subsystems
  3693. let mut num_acceptable_failed_iterations = 0;
  3694. loop {
  3695. let account_accessor = self.get_account_accessor(slot, pubkey, &storage_location);
  3696. match account_accessor {
  3697. LoadedAccountAccessor::Cached(Some(_)) | LoadedAccountAccessor::Stored(Some(_)) => {
  3698. // Great! There was no race, just return :) This is the most usual situation
  3699. return Some((account_accessor, slot));
  3700. }
  3701. LoadedAccountAccessor::Cached(None) => {
  3702. num_acceptable_failed_iterations += 1;
  3703. // Cache was flushed in between checking the index and retrieving from the cache,
  3704. // so retry. This works because in accounts cache flush, an account is written to
  3705. // storage *before* it is removed from the cache
  3706. match load_hint {
  3707. LoadHint::FixedMaxRootDoNotPopulateReadCache | LoadHint::FixedMaxRoot => {
  3708. // it's impossible for this to fail for transaction loads from
  3709. // replaying/banking more than once.
  3710. // This is because:
  3711. // 1) For a slot `X` that's being replayed, there is only one
  3712. // latest ancestor containing the latest update for the account, and this
  3713. // ancestor can only be flushed once.
  3714. // 2) The root cannot move while replaying, so the index cannot continually
  3715. // find more up to date entries than the current `slot`
  3716. assert!(num_acceptable_failed_iterations <= 1);
  3717. }
  3718. LoadHint::Unspecified => {
  3719. // Because newer root can be added to the index (= not fixed),
  3720. // multiple flush race conditions can be observed under very rare
  3721. // condition, at least theoretically
  3722. }
  3723. }
  3724. }
  3725. LoadedAccountAccessor::Stored(None) => {
  3726. match load_hint {
  3727. LoadHint::FixedMaxRootDoNotPopulateReadCache | LoadHint::FixedMaxRoot => {
  3728. // When running replay on the validator, or banking stage on the leader,
  3729. // it should be very rare that the storage entry doesn't exist if the
  3730. // entry in the accounts index is the latest version of this account.
  3731. //
  3732. // There are only a few places where the storage entry may not exist
  3733. // after reading the index:
  3734. // 1) Shrink has removed the old storage entry and rewritten to
  3735. // a newer storage entry
  3736. // 2) The `pubkey` asked for in this function is a zero-lamport account,
  3737. // and the storage entry holding this account qualified for zero-lamport clean.
  3738. //
  3739. // In both these cases, it should be safe to retry and recheck the accounts
  3740. // index indefinitely, without incrementing num_acceptable_failed_iterations.
  3741. // That's because if the root is fixed, there should be a bounded number
  3742. // of pending cleans/shrinks (depends how far behind the AccountsBackgroundService
  3743. // is), termination to the desired condition is guaranteed.
  3744. //
  3745. // Also note that in both cases, if we do find the storage entry,
  3746. // we can guarantee that the storage entry is safe to read from because
  3747. // we grabbed a reference to the storage entry while it was still in the
  3748. // storage map. This means even if the storage entry is removed from the storage
  3749. // map after we grabbed the storage entry, the recycler should not reset the
  3750. // storage entry until we drop the reference to the storage entry.
  3751. //
  3752. // eh, no code in this arm? yes!
  3753. }
  3754. LoadHint::Unspecified => {
  3755. // RPC get_account() may have fetched an old root from the index that was
  3756. // either:
  3757. // 1) Cleaned up by clean_accounts(), so the accounts index has been updated
  3758. // and the storage entries have been removed.
  3759. // 2) Dropped by purge_slots() because the slot was on a minor fork, which
  3760. // removes the slots' storage entries but doesn't purge from the accounts index
  3761. // (account index cleanup is left to clean for stored slots). Note that
  3762. // this generally is impossible to occur in the wild because the RPC
  3763. // should hold the slot's bank, preventing it from being purged() to
  3764. // begin with.
  3765. num_acceptable_failed_iterations += 1;
  3766. }
  3767. }
  3768. }
  3769. }
  3770. #[cfg(not(test))]
  3771. let load_limit = ABSURD_CONSECUTIVE_FAILED_ITERATIONS;
  3772. #[cfg(test)]
  3773. let load_limit = self.load_limit.load(Ordering::Relaxed);
  3774. let fallback_to_slow_path = if num_acceptable_failed_iterations >= load_limit {
  3775. // The latest version of the account existed in the index, but could not be
  3776. // fetched from storage. This means a race occurred between this function and clean
  3777. // accounts/purge_slots
  3778. let message = format!(
  3779. "do_load() failed to get key: {pubkey} from storage, latest attempt was for \
  3780. slot: {slot}, storage_location: {storage_location:?}, load_hint: \
  3781. {load_hint:?}",
  3782. );
  3783. datapoint_warn!("accounts_db-do_load_warn", ("warn", message, String));
  3784. true
  3785. } else {
  3786. false
  3787. };
  3788. // Because reading from the cache/storage failed, retry from the index read
  3789. let (new_slot, new_storage_location, maybe_account_accessor) = self
  3790. .read_index_for_accessor_or_load_slow(
  3791. ancestors,
  3792. pubkey,
  3793. max_root,
  3794. fallback_to_slow_path,
  3795. )?;
  3796. // Notice the subtle `?` at previous line, we bail out pretty early if missing.
  3797. if new_slot == slot && new_storage_location.is_store_id_equal(&storage_location) {
  3798. self.accounts_index
  3799. .get_and_then(pubkey, |entry| -> (_, ()) {
  3800. let message = format!(
  3801. "Bad index entry detected ({pubkey}, {slot}, {storage_location:?}, \
  3802. {load_hint:?}, {new_storage_location:?}, {entry:?})"
  3803. );
  3804. // Considering that we've failed to get accessor above and further that
  3805. // the index still returned the same (slot, store_id) tuple, offset must be same
  3806. // too.
  3807. assert!(
  3808. new_storage_location.is_offset_equal(&storage_location),
  3809. "{message}"
  3810. );
  3811. // If the entry was missing from the cache, that means it must have been flushed,
  3812. // and the accounts index is always updated before cache flush, so store_id must
  3813. // not indicate being cached at this point.
  3814. assert!(!new_storage_location.is_cached(), "{message}");
  3815. // If this is not a cache entry, then this was a minor fork slot
  3816. // that had its storage entries cleaned up by purge_slots() but hasn't been
  3817. // cleaned yet. That means this must be rpc access and not replay/banking at the
  3818. // very least. Note that purge shouldn't occur even for RPC as caller must hold all
  3819. // of ancestor slots..
  3820. assert_eq!(load_hint, LoadHint::Unspecified, "{message}");
  3821. // Everything being assert!()-ed, let's panic!() here as it's an error condition
  3822. // after all....
  3823. // That reasoning is based on the fact all of code-path reaching this fn
  3824. // retry_to_get_account_accessor() must outlive the Arc<Bank> (and its all
  3825. // ancestors) over this fn invocation, guaranteeing the prevention of being purged,
  3826. // first of all.
  3827. // For details, see the comment in AccountIndex::do_checked_scan_accounts(),
  3828. // which is referring back here.
  3829. panic!("{message}");
  3830. });
  3831. } else if fallback_to_slow_path {
// the above bad-index-entry check must have been performed first to retain the same
// behavior
  3834. return Some((
  3835. maybe_account_accessor.expect("must be some if clone_in_lock=true"),
  3836. new_slot,
  3837. ));
  3838. }
  3839. slot = new_slot;
  3840. storage_location = new_storage_location;
  3841. }
  3842. }
/// Load the account at `pubkey` visible from `ancestors`.
///
/// Thin wrapper over `do_load_with_populate_read_cache` with
/// `load_into_read_cache_only = false`, i.e. a normal load that returns the
/// account data (and may populate the read-only cache as a side effect)
/// rather than merely warming the cache.
///
/// Returns the account and the slot it was last stored at, or `None` if the
/// key is absent (or is zero-lamport and `load_zero_lamports` says to skip it).
fn do_load(
    &self,
    ancestors: &Ancestors,
    pubkey: &Pubkey,
    max_root: Option<Slot>,
    load_hint: LoadHint,
    load_zero_lamports: LoadZeroLamports,
) -> Option<(AccountSharedData, Slot)> {
    self.do_load_with_populate_read_cache(
        ancestors,
        pubkey,
        max_root,
        load_hint,
        false, // load_into_read_cache_only: this is a regular load
        load_zero_lamports,
    )
}
/// Load account with `pubkey` and maybe put into read cache.
///
/// Return the account and the slot when the account was last stored.
/// Return None for ZeroLamport accounts.
pub fn load_account_with(
    &self,
    ancestors: &Ancestors,
    pubkey: &Pubkey,
    should_put_in_read_cache: bool,
) -> Option<(AccountSharedData, Slot)> {
    let (slot, storage_location, _maybe_account_accessor) =
        self.read_index_for_accessor_or_load_slow(ancestors, pubkey, None, false)?;
    // Notice the subtle `?` at previous line, we bail out pretty early if missing.
    let in_write_cache = storage_location.is_cached();
    if !in_write_cache {
        // Not in the write cache, so the read-only cache may already hold the
        // account and save us a storage access.
        let result = self.read_only_accounts_cache.load(*pubkey, slot);
        if let Some(account) = result {
            if account.is_zero_lamport() {
                return None;
            }
            return Some((account, slot));
        }
    }
    let (mut account_accessor, slot) = self.retry_to_get_account_accessor(
        slot,
        storage_location,
        ancestors,
        pubkey,
        None,
        LoadHint::Unspecified,
    )?;
    // note that the account being in the cache could be different now than it was previously
    // since the cache could be flushed in between the 2 calls.
    let in_write_cache = matches!(account_accessor, LoadedAccountAccessor::Cached(_));
    let account = account_accessor.check_and_get_loaded_account_shared_data();
    if account.is_zero_lamport() {
        return None;
    }
    if !in_write_cache && should_put_in_read_cache {
        /*
        We show this store into the read-only cache for account 'A' and future loads of 'A' from the read-only cache are
        safe/reflect 'A''s latest state on this fork.
        This safety holds if during replay of slot 'S', we show we only read 'A' from the write cache,
        not the read-only cache, after it's been updated in replay of slot 'S'.
        Assume for contradiction this is not true, and we read 'A' from the read-only cache *after* it had been updated in 'S'.
        This means an entry '(S, A)' was added to the read-only cache after 'A' had been updated in 'S'.
        Now when '(S, A)' was being added to the read-only cache, it must have been true that 'is_cache == false',
        which means '(S, A)' does not exist in the write cache yet.
        However, by the assumption for contradiction above, 'A' has already been updated in 'S' which means '(S, A)'
        must exist in the write cache, which is a contradiction.
        */
        self.read_only_accounts_cache
            .store(*pubkey, slot, account.clone());
    }
    Some((account, slot))
}
/// if 'load_into_read_cache_only', then return value is meaningless.
/// The goal is to get the account into the read-only cache.
///
/// Otherwise, behaves like a normal load: consult the read-only cache first
/// (when the index says the account is not in the write cache), then fall
/// back to `retry_to_get_account_accessor` and optionally populate the
/// read-only cache with the result.
fn do_load_with_populate_read_cache(
    &self,
    ancestors: &Ancestors,
    pubkey: &Pubkey,
    max_root: Option<Slot>,
    load_hint: LoadHint,
    load_into_read_cache_only: bool,
    load_zero_lamports: LoadZeroLamports,
) -> Option<(AccountSharedData, Slot)> {
    // In production, a caller-supplied max_root is not supported.
    #[cfg(not(test))]
    assert!(max_root.is_none());
    // Snapshot the max root so we can verify it stayed fixed when the load
    // hint promises that (checked at the bottom of this function).
    let starting_max_root = self.accounts_index.max_root_inclusive();
    let (slot, storage_location, _maybe_account_accessor) =
        self.read_index_for_accessor_or_load_slow(ancestors, pubkey, max_root, false)?;
    // Notice the subtle `?` at previous line, we bail out pretty early if missing.
    let in_write_cache = storage_location.is_cached();
    if !load_into_read_cache_only {
        if !in_write_cache {
            // Try the read-only cache before touching storage.
            let result = self.read_only_accounts_cache.load(*pubkey, slot);
            if let Some(account) = result {
                if load_zero_lamports == LoadZeroLamports::None && account.is_zero_lamport() {
                    return None;
                }
                return Some((account, slot));
            }
        }
    } else {
        // goal is to load into read cache
        if in_write_cache {
            // no reason to load in read cache. already in write cache
            return None;
        }
        if self.read_only_accounts_cache.in_cache(pubkey, slot) {
            // already in read cache
            return None;
        }
    }
    let (mut account_accessor, slot) = self.retry_to_get_account_accessor(
        slot,
        storage_location,
        ancestors,
        pubkey,
        max_root,
        load_hint,
    )?;
    // note that the account being in the cache could be different now than it was previously
    // since the cache could be flushed in between the 2 calls.
    let in_write_cache = matches!(account_accessor, LoadedAccountAccessor::Cached(_));
    let account = account_accessor.check_and_get_loaded_account_shared_data();
    if load_zero_lamports == LoadZeroLamports::None && account.is_zero_lamport() {
        return None;
    }
    if !in_write_cache && load_hint != LoadHint::FixedMaxRootDoNotPopulateReadCache {
        /*
        We show this store into the read-only cache for account 'A' and future loads of 'A' from the read-only cache are
        safe/reflect 'A''s latest state on this fork.
        This safety holds if during replay of slot 'S', we show we only read 'A' from the write cache,
        not the read-only cache, after it's been updated in replay of slot 'S'.
        Assume for contradiction this is not true, and we read 'A' from the read-only cache *after* it had been updated in 'S'.
        This means an entry '(S, A)' was added to the read-only cache after 'A' had been updated in 'S'.
        Now when '(S, A)' was being added to the read-only cache, it must have been true that 'is_cache == false',
        which means '(S, A)' does not exist in the write cache yet.
        However, by the assumption for contradiction above, 'A' has already been updated in 'S' which means '(S, A)'
        must exist in the write cache, which is a contradiction.
        */
        self.read_only_accounts_cache
            .store(*pubkey, slot, account.clone());
    }
    if load_hint == LoadHint::FixedMaxRoot
        || load_hint == LoadHint::FixedMaxRootDoNotPopulateReadCache
    {
        // If the load hint is that the max root is fixed, the max root should be fixed.
        let ending_max_root = self.accounts_index.max_root_inclusive();
        if starting_max_root != ending_max_root {
            warn!(
                "do_load_with_populate_read_cache() scanning pubkey {pubkey} called with \
                 fixed max root, but max root changed from {starting_max_root} to \
                 {ending_max_root} during function call"
            );
        }
    }
    Some((account, slot))
}
  4001. fn get_account_accessor<'a>(
  4002. &'a self,
  4003. slot: Slot,
  4004. pubkey: &'a Pubkey,
  4005. storage_location: &StorageLocation,
  4006. ) -> LoadedAccountAccessor<'a> {
  4007. match storage_location {
  4008. StorageLocation::Cached => {
  4009. let maybe_cached_account = self.accounts_cache.load(slot, pubkey).map(Cow::Owned);
  4010. LoadedAccountAccessor::Cached(maybe_cached_account)
  4011. }
  4012. StorageLocation::AppendVec(store_id, offset) => {
  4013. let maybe_storage_entry = self
  4014. .storage
  4015. .get_account_storage_entry(slot, *store_id)
  4016. .map(|account_storage_entry| (account_storage_entry, *offset));
  4017. LoadedAccountAccessor::Stored(maybe_storage_entry)
  4018. }
  4019. }
  4020. }
  4021. fn create_store(
  4022. &self,
  4023. slot: Slot,
  4024. size: u64,
  4025. from: &str,
  4026. paths: &[PathBuf],
  4027. ) -> Arc<AccountStorageEntry> {
  4028. self.stats
  4029. .create_store_count
  4030. .fetch_add(1, Ordering::Relaxed);
  4031. let path_index = thread_rng().gen_range(0..paths.len());
  4032. let store = Arc::new(self.new_storage_entry(slot, Path::new(&paths[path_index]), size));
  4033. debug!(
  4034. "creating store: {} slot: {} len: {} size: {} from: {} path: {}",
  4035. store.id(),
  4036. slot,
  4037. store.accounts.len(),
  4038. store.accounts.capacity(),
  4039. from,
  4040. store.accounts.path().display(),
  4041. );
  4042. store
  4043. }
/// Convenience wrapper: create a new storage entry for `slot` using the
/// default account paths (`self.paths`) and register it in the storage map.
/// `from` tags the caller for logging.
fn create_and_insert_store(
    &self,
    slot: Slot,
    size: u64,
    from: &str,
) -> Arc<AccountStorageEntry> {
    self.create_and_insert_store_with_paths(slot, size, from, &self.paths)
}
  4052. fn create_and_insert_store_with_paths(
  4053. &self,
  4054. slot: Slot,
  4055. size: u64,
  4056. from: &str,
  4057. paths: &[PathBuf],
  4058. ) -> Arc<AccountStorageEntry> {
  4059. let store = self.create_store(slot, size, from, paths);
  4060. let store_for_index = store.clone();
  4061. self.insert_store(slot, store_for_index);
  4062. store
  4063. }
/// Register `store` as a storage entry for `slot` in the storage map.
fn insert_store(&self, slot: Slot, store: Arc<AccountStorageEntry>) {
    self.storage.insert(slot, store)
}
/// Set the flag checked by `purge_slot()`. Once enabled, `purge_slot()`
/// panics unless it is called serialized with the accounts background
/// service (see `purge_slot`'s `is_serialized_with_abs` parameter).
pub fn enable_bank_drop_callback(&self) {
    self.is_bank_drop_callback_enabled
        .store(true, Ordering::Release);
}
/// This should only be called after the `Bank::drop()` runs in bank.rs, See BANK_DROP_SAFETY
/// comment below for more explanation.
/// * `is_serialized_with_abs` - indicates whether this call runs sequentially
///   with all other accounts_db relevant calls, such as shrinking, purging etc.,
///   in accounts background service.
pub fn purge_slot(&self, slot: Slot, bank_id: BankId, is_serialized_with_abs: bool) {
    // Guard against unserialized drops once the callback discipline is enabled
    // (see `enable_bank_drop_callback`). Acquire pairs with the Release store there.
    if self.is_bank_drop_callback_enabled.load(Ordering::Acquire) && !is_serialized_with_abs {
        panic!(
            "bad drop callpath detected; Bank::drop() must run serially with other logic in \
             ABS like clean_accounts()"
        )
    }
    // BANK_DROP_SAFETY: Because this function only runs once the bank is dropped,
    // we know that there are no longer any ongoing scans on this bank, because scans require
    // and hold a reference to the bank at the tip of the fork they're scanning. Hence it's
    // safe to remove this bank_id from the `removed_bank_ids` list at this point.
    if self
        .accounts_index
        .removed_bank_ids
        .lock()
        .unwrap()
        .remove(&bank_id)
    {
        // If this slot was already cleaned up, no need to do any further cleans
        return;
    }
    self.purge_slots(std::iter::once(&slot));
}
/// Purges every slot in `removed_slots` from both the cache and storage. This includes
/// entries in the accounts index, cache entries, and any backing storage entries.
///
/// Timing/count metrics for the cache side are accumulated locally and folded
/// into `purge_stats` once at the end; storage-side purges report through
/// `purge_slot_storage` directly.
pub fn purge_slots_from_cache_and_store<'a>(
    &self,
    removed_slots: impl Iterator<Item = &'a Slot> + Clone,
    purge_stats: &PurgeStats,
) {
    let mut remove_cache_elapsed_across_slots = 0;
    let mut num_cached_slots_removed = 0;
    let mut total_removed_cached_bytes = 0;
    for remove_slot in removed_slots {
        // This function is only currently safe with respect to `flush_slot_cache()` because
        // both functions run serially in AccountsBackgroundService.
        let mut remove_cache_elapsed = Measure::start("remove_cache_elapsed");
        // Note: we cannot remove this slot from the slot cache until we've removed its
        // entries from the accounts index first. This is because `scan_accounts()` relies on
        // holding the index lock, finding the index entry, and then looking up the entry
        // in the cache. If it fails to find that entry, it will panic in `get_loaded_account()`
        if let Some(slot_cache) = self.accounts_cache.slot_cache(*remove_slot) {
            // If the slot is still in the cache, remove the backing storages for
            // the slot and from the Accounts Index
            num_cached_slots_removed += 1;
            total_removed_cached_bytes += slot_cache.total_bytes();
            self.purge_slot_cache(*remove_slot, &slot_cache);
            remove_cache_elapsed.stop();
            remove_cache_elapsed_across_slots += remove_cache_elapsed.as_us();
            // Nobody else should have removed the slot cache entry yet
            assert!(self.accounts_cache.remove_slot(*remove_slot).is_some());
        } else {
            // Slot was not in the cache, so purge its backing storage instead.
            self.purge_slot_storage(*remove_slot, purge_stats);
        }
        // It should not be possible that a slot is neither in the cache or storage. Even in
        // a slot with all ticks, `Bank::new_from_parent()` immediately stores some sysvars
        // on bank creation.
    }
    purge_stats
        .remove_cache_elapsed
        .fetch_add(remove_cache_elapsed_across_slots, Ordering::Relaxed);
    purge_stats
        .num_cached_slots_removed
        .fetch_add(num_cached_slots_removed, Ordering::Relaxed);
    purge_stats
        .total_removed_cached_bytes
        .fetch_add(total_removed_cached_bytes, Ordering::Relaxed);
}
/// Purge the backing storage entries for the given slot, does not purge from
/// the cache!
///
/// Removes each slot's storage from the storage map, then drops the removed
/// storages (and their backing mmaps) outside of any locks, recording timing
/// and byte/slot counts into `purge_stats`.
fn purge_dead_slots_from_storage<'a>(
    &'a self,
    removed_slots: impl Iterator<Item = &'a Slot> + Clone,
    purge_stats: &PurgeStats,
) {
    // Check all slots `removed_slots` are no longer "relevant" roots.
    // Note that the slots here could have been rooted slots, but if they're passed here
    // for removal it means:
    // 1) All updates in that old root have been outdated by updates in newer roots
    // 2) Those slots/roots should have already been purged from the accounts index root
    //    tracking metadata via `accounts_index.clean_dead_slot()`.
    let mut safety_checks_elapsed = Measure::start("safety_checks_elapsed");
    assert!(self
        .accounts_index
        .get_rooted_from_list(removed_slots.clone())
        .is_empty());
    safety_checks_elapsed.stop();
    purge_stats
        .safety_checks_elapsed
        .fetch_add(safety_checks_elapsed.as_us(), Ordering::Relaxed);
    let mut total_removed_stored_bytes = 0;
    let mut all_removed_slot_storages = vec![];
    let mut remove_storage_entries_elapsed = Measure::start("remove_storage_entries_elapsed");
    for remove_slot in removed_slots {
        // Remove the storage entries and collect some metrics
        if let Some(store) = self.storage.remove(remove_slot, false) {
            total_removed_stored_bytes += store.accounts.capacity();
            all_removed_slot_storages.push(store);
        }
    }
    remove_storage_entries_elapsed.stop();
    let num_stored_slots_removed = all_removed_slot_storages.len();
    // Backing mmaps for removed storages entries explicitly dropped here outside
    // of any locks
    let mut drop_storage_entries_elapsed = Measure::start("drop_storage_entries_elapsed");
    drop(all_removed_slot_storages);
    drop_storage_entries_elapsed.stop();
    purge_stats
        .remove_storage_entries_elapsed
        .fetch_add(remove_storage_entries_elapsed.as_us(), Ordering::Relaxed);
    purge_stats
        .drop_storage_entries_elapsed
        .fetch_add(drop_storage_entries_elapsed.as_us(), Ordering::Relaxed);
    purge_stats
        .num_stored_slots_removed
        .fetch_add(num_stored_slots_removed, Ordering::Relaxed);
    purge_stats
        .total_removed_storage_entries
        .fetch_add(num_stored_slots_removed, Ordering::Relaxed);
    purge_stats
        .total_removed_stored_bytes
        .fetch_add(total_removed_stored_bytes, Ordering::Relaxed);
    self.stats
        .dropped_stores
        .fetch_add(num_stored_slots_removed as u64, Ordering::Relaxed);
}
  4202. fn purge_slot_cache(&self, purged_slot: Slot, slot_cache: &SlotCache) {
  4203. let pubkeys = slot_cache.iter().map(|account| *account.key());
  4204. self.purge_slot_cache_pubkeys(purged_slot, pubkeys, true);
  4205. }
  4206. fn purge_slot_cache_pubkeys(
  4207. &self,
  4208. purged_slot: Slot,
  4209. pubkeys: impl IntoIterator<Item = Pubkey>,
  4210. is_dead: bool,
  4211. ) {
  4212. // Slot purged from cache should not exist in the backing store
  4213. assert!(self
  4214. .storage
  4215. .get_slot_storage_entry_shrinking_in_progress_ok(purged_slot)
  4216. .is_none());
  4217. let mut num_purged_keys = 0;
  4218. let (reclaims, _) = self.purge_keys_exact(pubkeys.into_iter().map(|key| {
  4219. num_purged_keys += 1;
  4220. (key, purged_slot)
  4221. }));
  4222. assert_eq!(reclaims.len(), num_purged_keys);
  4223. if is_dead {
  4224. self.remove_dead_slots_metadata(std::iter::once(&purged_slot));
  4225. }
  4226. }
/// Purge `remove_slot`'s flushed state: remove all of its accounts from the
/// accounts index and then reclaim its storage entry.
///
/// Panics (via the trailing assert) if the storage entry still exists after
/// reclaims are handled.
fn purge_slot_storage(&self, remove_slot: Slot, purge_stats: &PurgeStats) {
    // Because AccountsBackgroundService synchronously flushes from the accounts cache
    // and handles all Bank::drop() (the cleanup function that leads to this
    // function call), then we don't need to worry about an overlapping cache flush
    // with this function call. This means, if we get into this case, we can be
    // confident that the entire state for this slot has been flushed to the storage
    // already.
    let mut scan_storages_elapsed = Measure::start("scan_storages_elapsed");
    let mut stored_keys = HashSet::new();
    if let Some(storage) = self
        .storage
        .get_slot_storage_entry_shrinking_in_progress_ok(remove_slot)
    {
        // Collect every (pubkey, slot) pair stored in this slot so the exact
        // index entries can be purged below.
        storage
            .accounts
            .scan_pubkeys(|pk| {
                stored_keys.insert((*pk, remove_slot));
            })
            .expect("must scan accounts storage");
    }
    scan_storages_elapsed.stop();
    purge_stats
        .scan_storages_elapsed
        .fetch_add(scan_storages_elapsed.as_us(), Ordering::Relaxed);
    let mut purge_accounts_index_elapsed = Measure::start("purge_accounts_index_elapsed");
    // Purge this slot from the accounts index
    let (reclaims, pubkeys_removed_from_accounts_index) = self.purge_keys_exact(stored_keys);
    purge_accounts_index_elapsed.stop();
    purge_stats
        .purge_accounts_index_elapsed
        .fetch_add(purge_accounts_index_elapsed.as_us(), Ordering::Relaxed);
    // `handle_reclaims()` should remove all the account index entries and
    // storage entries
    let mut handle_reclaims_elapsed = Measure::start("handle_reclaims_elapsed");
    // Slot should be dead after removing all its account entries
    // There is no reason to mark accounts obsolete as the slot storage is being purged
    let expected_dead_slot = Some(remove_slot);
    if !reclaims.is_empty() {
        self.handle_reclaims(
            reclaims.iter(),
            expected_dead_slot,
            &pubkeys_removed_from_accounts_index,
            HandleReclaims::ProcessDeadSlots(purge_stats),
            MarkAccountsObsolete::No,
        );
    }
    handle_reclaims_elapsed.stop();
    purge_stats
        .handle_reclaims_elapsed
        .fetch_add(handle_reclaims_elapsed.as_us(), Ordering::Relaxed);
    // After handling the reclaimed entries, this slot's
    // storage entries should be purged from self.storage
    assert!(
        self.storage.get_slot_storage_entry(remove_slot).is_none(),
        "slot {remove_slot} is not none"
    );
}
  4284. fn purge_slots<'a>(&self, slots: impl Iterator<Item = &'a Slot> + Clone) {
  4285. // `add_root()` should be called first
  4286. let mut safety_checks_elapsed = Measure::start("safety_checks_elapsed");
  4287. let non_roots = slots
  4288. // Only safe to check when there are duplicate versions of a slot
  4289. // because ReplayStage will not make new roots before dumping the
  4290. // duplicate slots first. Thus we will not be in a case where we
  4291. // root slot `S`, then try to dump some other version of slot `S`, the
  4292. // dumping has to finish first
  4293. //
  4294. // Also note roots are never removed via `remove_unrooted_slot()`, so
  4295. // it's safe to filter them out here as they won't need deletion from
  4296. // self.accounts_index.removed_bank_ids in `purge_slots_from_cache_and_store()`.
  4297. .filter(|slot| !self.accounts_index.is_alive_root(**slot));
  4298. safety_checks_elapsed.stop();
  4299. self.external_purge_slots_stats
  4300. .safety_checks_elapsed
  4301. .fetch_add(safety_checks_elapsed.as_us(), Ordering::Relaxed);
  4302. self.purge_slots_from_cache_and_store(non_roots, &self.external_purge_slots_stats);
  4303. self.external_purge_slots_stats
  4304. .report("external_purge_slots_stats", Some(1000));
  4305. }
/// Remove the given unrooted slots (e.g. abandoned minor-fork slots) from the
/// cache and storage.
///
/// Coordinates with the cache-flush thread through
/// `remove_unrooted_slots_synchronization`: each slot is reserved in
/// `slots_under_contention` before purging, waiting on the condvar for any
/// slot currently being flushed. Panics if any requested slot is rooted.
pub fn remove_unrooted_slots(&self, remove_slots: &[(Slot, BankId)]) {
    let rooted_slots = self
        .accounts_index
        .get_rooted_from_list(remove_slots.iter().map(|(slot, _)| slot));
    assert!(
        rooted_slots.is_empty(),
        "Trying to remove accounts for rooted slots {rooted_slots:?}"
    );
    let RemoveUnrootedSlotsSynchronization {
        slots_under_contention,
        signal,
    } = &self.remove_unrooted_slots_synchronization;
    {
        // Slots that are currently being flushed by flush_slot_cache()
        let mut currently_contended_slots = slots_under_contention.lock().unwrap();
        // Slots that are currently being flushed by flush_slot_cache() AND
        // we want to remove in this function
        let mut remaining_contended_flush_slots: Vec<Slot> = remove_slots
            .iter()
            .filter_map(|(remove_slot, _)| {
                // Reserve the slots that we want to purge that aren't currently
                // being flushed to prevent cache from flushing those slots in
                // the future.
                //
                // Note that the single replay thread has to remove a specific slot `N`
                // before another version of the same slot can be replayed. This means
                // multiple threads should not call `remove_unrooted_slots()` simultaneously
                // with the same slot.
                let is_being_flushed = !currently_contended_slots.insert(*remove_slot);
                // If the cache is currently flushing this slot, add it to the list
                is_being_flushed.then_some(remove_slot)
            })
            .cloned()
            .collect();
        // Wait for cache flushes to finish
        loop {
            if !remaining_contended_flush_slots.is_empty() {
                // Wait for the signal that the cache has finished flushing a slot
                //
                // Don't wait if the remaining_contended_flush_slots is empty, otherwise
                // we may never get a signal since there's no cache flush thread to
                // do the signaling
                currently_contended_slots = signal.wait(currently_contended_slots).unwrap();
            } else {
                // There are no slots being flushed to wait on, so it's safe to continue
                // to purging the slots we want to purge!
                break;
            }
            // For each slot the cache flush has finished, mark that we're about to start
            // purging these slots by reserving it in `currently_contended_slots`.
            remaining_contended_flush_slots.retain(|flush_slot| {
                // returns true if slot was already in set. This means slot is being flushed
                !currently_contended_slots.insert(*flush_slot)
            });
        }
    }
    // Mark down these slots are about to be purged so that new attempts to scan these
    // banks fail, and any ongoing scans over these slots will detect that they should abort
    // their results
    {
        let mut locked_removed_bank_ids = self.accounts_index.removed_bank_ids.lock().unwrap();
        for (_slot, remove_bank_id) in remove_slots.iter() {
            locked_removed_bank_ids.insert(*remove_bank_id);
        }
    }
    let remove_unrooted_purge_stats = PurgeStats::default();
    self.purge_slots_from_cache_and_store(
        remove_slots.iter().map(|(slot, _)| slot),
        &remove_unrooted_purge_stats,
    );
    remove_unrooted_purge_stats.report("remove_unrooted_slots_purge_slots_stats", None);
    // Release the reservations taken above now that purging is complete.
    let mut currently_contended_slots = slots_under_contention.lock().unwrap();
    for (remove_slot, _) in remove_slots {
        assert!(currently_contended_slots.remove(remove_slot));
    }
}
  4382. /// Calculates the `AccountLtHash` of `account`
  4383. pub fn lt_hash_account(account: &impl ReadableAccount, pubkey: &Pubkey) -> AccountLtHash {
  4384. if account.lamports() == 0 {
  4385. return ZERO_LAMPORT_ACCOUNT_LT_HASH;
  4386. }
  4387. let hasher = Self::hash_account_helper(account, pubkey);
  4388. let lt_hash = LtHash::with(&hasher);
  4389. AccountLtHash(lt_hash)
  4390. }
/// Hashes `account` and returns the underlying Hasher
///
/// The hashed byte stream is, in order: lamports (LE), account data,
/// executable flag (one byte), owner, then `pubkey`. Small accounts are
/// staged in a single stack buffer to minimize hasher update calls; the
/// stream fed to the hasher is identical either way.
fn hash_account_helper(account: &impl ReadableAccount, pubkey: &Pubkey) -> blake3::Hasher {
    let mut hasher = blake3::Hasher::new();
    // allocate a buffer on the stack that's big enough
    // to hold a token account or a stake account
    const META_SIZE: usize = 8 /* lamports */ + 1 /* executable */ + 32 /* owner */ + 32 /* pubkey */;
    const DATA_SIZE: usize = 200; // stake accounts are 200 B and token accounts are 165-182ish B
    const BUFFER_SIZE: usize = META_SIZE + DATA_SIZE;
    let mut buffer = SmallVec::<[u8; BUFFER_SIZE]>::new();
    // collect lamports into buffer to hash
    buffer.extend_from_slice(&account.lamports().to_le_bytes());
    let data = account.data();
    if data.len() > DATA_SIZE {
        // For larger accounts whose data can't fit into the buffer, update the hash now.
        hasher.update(&buffer);
        buffer.clear();
        // hash account's data
        hasher.update(data);
    } else {
        // For small accounts whose data can fit into the buffer, append it to the buffer.
        buffer.extend_from_slice(data);
    }
    // collect executable, owner, and pubkey into buffer to hash
    buffer.push(account.executable().into());
    buffer.extend_from_slice(account.owner().as_ref());
    buffer.extend_from_slice(pubkey.as_ref());
    hasher.update(&buffer);
    hasher
}
  4420. pub fn mark_slot_frozen(&self, slot: Slot) {
  4421. if let Some(slot_cache) = self.accounts_cache.slot_cache(slot) {
  4422. slot_cache.mark_slot_frozen();
  4423. slot_cache.report_slot_store_metrics();
  4424. }
  4425. self.accounts_cache.report_size();
  4426. }
  4427. /// true if write cache is too big
  4428. fn should_aggressively_flush_cache(&self) -> bool {
  4429. self.write_cache_limit_bytes
  4430. .unwrap_or(WRITE_CACHE_LIMIT_BYTES_DEFAULT)
  4431. < self.accounts_cache.size()
  4432. }
/// Flushes the write cache to storage.
///
/// `force_flush` flushes all the cached roots `<= requested_flush_root`. It also then
/// flushes:
/// 1) excess remaining roots or unrooted slots while 'should_aggressively_flush_cache' is true
pub fn flush_accounts_cache(&self, force_flush: bool, requested_flush_root: Option<Slot>) {
    // Outside of tests a flush root must always be provided so snapshotting can
    // rely on every root <= it being in storage.
    #[cfg(not(test))]
    assert!(requested_flush_root.is_some());
    if !force_flush && !self.should_aggressively_flush_cache() {
        return;
    }
    // Flush only the roots <= requested_flush_root, so that snapshotting has all
    // the relevant roots in storage.
    let mut flush_roots_elapsed = Measure::start("flush_roots_elapsed");
    let _guard = self.active_stats.activate(ActiveStatItem::Flush);
    // Note even if force_flush is false, we will still flush all roots <= the
    // given `requested_flush_root`, even if some of the later roots cannot be used for
    // cleaning due to an ongoing scan
    let (total_new_cleaned_roots, num_cleaned_roots_flushed, mut flush_stats) = self
        .flush_rooted_accounts_cache(
            requested_flush_root,
            true, // should_clean
        );
    flush_roots_elapsed.stop();
    // Note we don't purge unrooted slots here because there may be ongoing scans/references
    // for those slots, let the Bank::drop() implementation do cleanup instead on dead
    // banks
    // If 'should_aggressively_flush_cache', then flush the excess ones to storage
    let (total_new_excess_roots, num_excess_roots_flushed, flush_stats_aggressively) =
        if self.should_aggressively_flush_cache() {
            // Start by flushing the roots
            //
            // Cannot do any cleaning on roots past `requested_flush_root` because future
            // snapshots may need updates from those later slots, hence we pass `false`
            // for `should_clean`.
            self.flush_rooted_accounts_cache(None, false)
        } else {
            (0, 0, FlushStats::default())
        };
    flush_stats.accumulate(&flush_stats_aggressively);
    let mut excess_slot_count = 0;
    let mut unflushable_unrooted_slot_count = 0;
    let max_flushed_root = self.accounts_cache.fetch_max_flush_root();
    if self.should_aggressively_flush_cache() {
        let mut old_slots = self.accounts_cache.cached_frozen_slots();
        old_slots.sort_unstable();
        excess_slot_count = old_slots.len();
        // NOTE: this `flush_stats` deliberately shadows the outer one; it only
        // feeds the "aggressively" datapoint below, while the outer accumulator
        // feeds the final "flush_accounts_cache" datapoint.
        let mut flush_stats = FlushStats::default();
        old_slots.into_iter().for_each(|old_slot| {
            // Don't flush slots that are known to be unrooted
            if old_slot > max_flushed_root {
                // Re-check on each iteration: earlier flushes may have brought
                // the cache back under its limit, letting us stop early.
                if self.should_aggressively_flush_cache() {
                    if let Some(stats) = self.flush_slot_cache(old_slot) {
                        flush_stats.accumulate(&stats);
                    }
                }
            } else {
                unflushable_unrooted_slot_count += 1;
            }
        });
        datapoint_info!(
            "accounts_db-flush_accounts_cache_aggressively",
            (
                "num_accounts_flushed",
                flush_stats.num_accounts_flushed.0,
                i64
            ),
            ("num_accounts_saved", flush_stats.num_accounts_purged.0, i64),
            (
                "account_bytes_flushed",
                flush_stats.num_bytes_flushed.0,
                i64
            ),
            ("account_bytes_saved", flush_stats.num_bytes_purged.0, i64),
            ("total_cache_size", self.accounts_cache.size(), i64),
            ("total_frozen_slots", excess_slot_count, i64),
            ("total_slots", self.accounts_cache.num_slots(), i64),
        );
    }
    datapoint_info!(
        "accounts_db-flush_accounts_cache",
        ("total_new_cleaned_roots", total_new_cleaned_roots, i64),
        ("num_cleaned_roots_flushed", num_cleaned_roots_flushed, i64),
        ("total_new_excess_roots", total_new_excess_roots, i64),
        ("num_excess_roots_flushed", num_excess_roots_flushed, i64),
        ("excess_slot_count", excess_slot_count, i64),
        (
            "unflushable_unrooted_slot_count",
            unflushable_unrooted_slot_count,
            i64
        ),
        ("flush_roots_elapsed", flush_roots_elapsed.as_us(), i64),
        (
            "account_bytes_flushed",
            flush_stats.num_bytes_flushed.0,
            i64
        ),
        (
            "num_accounts_flushed",
            flush_stats.num_accounts_flushed.0,
            i64
        ),
        ("account_bytes_saved", flush_stats.num_bytes_purged.0, i64),
        ("num_accounts_saved", flush_stats.num_accounts_purged.0, i64),
        (
            "store_accounts_total_us",
            flush_stats.store_accounts_total_us.0,
            i64
        ),
        (
            "update_index_us",
            flush_stats.store_accounts_timing.update_index_elapsed,
            i64
        ),
        (
            "store_accounts_elapsed_us",
            flush_stats.store_accounts_timing.store_accounts_elapsed,
            i64
        ),
        (
            "handle_reclaims_elapsed_us",
            flush_stats.store_accounts_timing.handle_reclaims_elapsed,
            i64
        ),
    );
}
  4557. fn flush_rooted_accounts_cache(
  4558. &self,
  4559. requested_flush_root: Option<Slot>,
  4560. should_clean: bool,
  4561. ) -> (usize, usize, FlushStats) {
  4562. let max_clean_root = should_clean
  4563. .then(|| {
  4564. // If there is a long running scan going on, this could prevent any cleaning
  4565. // based on updates from slots > `max_clean_root`.
  4566. self.max_clean_root(requested_flush_root)
  4567. })
  4568. .flatten();
  4569. let mut written_accounts = HashSet::new();
  4570. // If `should_clean` is false, then`should_flush_f` will be None, which will cause
  4571. // `flush_slot_cache` to flush all accounts to storage without cleaning any accounts.
  4572. let mut should_flush_f = should_clean
  4573. .then(|| {
  4574. Some(move |&pubkey: &Pubkey| {
  4575. // if not in hashset, then not flushed previously, so flush it
  4576. written_accounts.insert(pubkey)
  4577. })
  4578. })
  4579. .flatten();
  4580. // Always flush up to `requested_flush_root`, which is necessary for things like snapshotting.
  4581. let flushed_roots: BTreeSet<Slot> = self.accounts_cache.clear_roots(requested_flush_root);
  4582. // Iterate from highest to lowest so that we don't need to flush earlier
  4583. // outdated updates in earlier roots
  4584. let mut num_roots_flushed = 0;
  4585. let mut flush_stats = FlushStats::default();
  4586. for &root in flushed_roots.iter().rev() {
  4587. if let Some(stats) =
  4588. self.flush_slot_cache_with_clean(root, should_flush_f.as_mut(), max_clean_root)
  4589. {
  4590. num_roots_flushed += 1;
  4591. flush_stats.accumulate(&stats);
  4592. }
  4593. }
  4594. // Note that self.flush_slot_cache_with_clean() can return None if the
  4595. // slot is already been flushed. This can happen if the cache is
  4596. // overwhelmed and we flushed some yet to be rooted frozen slots.
  4597. // However, independent of whether the last slot was actually flushed
  4598. // from the cache by the above loop, we should always update the
  4599. // `max_flush_root` to the max of the flushed roots, because that's
  4600. // max_flushed_root tracks the logical last root that was flushed to
  4601. // storage by snapshotting.
  4602. if let Some(&root) = flushed_roots.last() {
  4603. self.accounts_cache.set_max_flush_root(root);
  4604. }
  4605. let num_new_roots = flushed_roots.len();
  4606. (num_new_roots, num_roots_flushed, flush_stats)
  4607. }
/// Flushes `slot_cache` (the write cache for `slot`) to storage.
///
/// `should_flush_f`, when present, decides per pubkey whether an account is
/// written to storage (true) or purged as superseded (false).
/// `max_clean_root`, when present, disables cleaning for slots above it.
fn do_flush_slot_cache(
    &self,
    slot: Slot,
    slot_cache: &SlotCache,
    mut should_flush_f: Option<&mut impl FnMut(&Pubkey) -> bool>,
    max_clean_root: Option<Slot>,
) -> FlushStats {
    let mut flush_stats = FlushStats::default();
    let iter_items: Vec<_> = slot_cache.iter().collect();
    // Pubkeys that are purged instead of flushed; they must be removed from
    // the index (see purge_slot_cache_pubkeys below).
    let mut pubkeys: Vec<Pubkey> = vec![];
    if should_flush_f.is_some() {
        if let Some(max_clean_root) = max_clean_root {
            if slot > max_clean_root {
                // Only if the root is greater than the `max_clean_root` do we
                // have to prevent cleaning, otherwise, just default to `should_flush_f`
                // for any slots <= `max_clean_root`
                should_flush_f = None;
            }
        }
    }
    let accounts: Vec<(&Pubkey, &AccountSharedData)> = iter_items
        .iter()
        .filter_map(|iter_item| {
            let key = iter_item.key();
            let account = &iter_item.value().account;
            // With no filter (None), everything is flushed.
            let should_flush = should_flush_f
                .as_mut()
                .map(|should_flush_f| should_flush_f(key))
                .unwrap_or(true);
            if should_flush {
                flush_stats.num_bytes_flushed +=
                    aligned_stored_size(account.data().len()) as u64;
                flush_stats.num_accounts_flushed += 1;
                Some((key, account))
            } else {
                // If we don't flush, we have to remove the entry from the
                // index, since it's equivalent to purging
                pubkeys.push(*key);
                flush_stats.num_bytes_purged +=
                    aligned_stored_size(account.data().len()) as u64;
                flush_stats.num_accounts_purged += 1;
                None
            }
        })
        .collect();
    let is_dead_slot = accounts.is_empty();
    // Remove the account index entries from earlier roots that are outdated by later roots.
    // Safe because queries to the index will be reading updates from later roots.
    self.purge_slot_cache_pubkeys(slot, pubkeys, is_dead_slot);
    if !is_dead_slot {
        // This ensures that all updates are written to an AppendVec, before any
        // updates to the index happen, so anybody that sees a real entry in the index,
        // will be able to find the account in storage
        let flushed_store = self.create_and_insert_store(
            slot,
            flush_stats.num_bytes_flushed.0,
            "flush_slot_cache",
        );
        // Use ReclaimOldSlots to reclaim old slots if marking obsolete accounts and cleaning.
        // Cleaning is enabled if `should_flush_f` is Some.
        // should_flush_f is set to None when
        // 1) There's an ongoing scan, to avoid reclaiming accounts being scanned.
        // 2) The slot is > max_clean_root, to prevent unrooted slots from reclaiming rooted versions.
        let reclaim_method = if self.mark_obsolete_accounts == MarkObsoleteAccounts::Enabled
            && should_flush_f.is_some()
        {
            UpsertReclaim::ReclaimOldSlots
        } else {
            UpsertReclaim::IgnoreReclaims
        };
        let (store_accounts_timing_inner, store_accounts_total_inner_us) = measure_us!(self
            ._store_accounts_frozen(
                (slot, &accounts[..]),
                &flushed_store,
                reclaim_method,
                UpdateIndexThreadSelection::PoolWithThreshold,
            ));
        flush_stats.store_accounts_timing = store_accounts_timing_inner;
        flush_stats.store_accounts_total_us = Saturating(store_accounts_total_inner_us);
        // If the above sizing function is correct, just one AppendVec is enough to hold
        // all the data for the slot
        assert!(self.storage.get_slot_storage_entry(slot).is_some());
        self.reopen_storage_as_readonly_shrinking_in_progress_ok(slot);
    }
    // Remove this slot from the cache. To AccountsDb's new readers this should
    // look like an atomic switch from the cache to storage.
    // There is a racy window for existing readers that read exactly while we
    // are flushing. That case is handled by retry_to_get_account_accessor()
    assert!(self.accounts_cache.remove_slot(slot).is_some());
    // Add `accounts` to uncleaned_pubkeys since we know they were written
    // to a storage and should be visited by `clean`.
    self.uncleaned_pubkeys
        .entry(slot)
        .or_default()
        .extend(accounts.into_iter().map(|(pubkey, _account)| *pubkey));
    flush_stats
}
  4705. /// flush all accounts in this slot
  4706. fn flush_slot_cache(&self, slot: Slot) -> Option<FlushStats> {
  4707. self.flush_slot_cache_with_clean(slot, None::<&mut fn(&_) -> bool>, None)
  4708. }
/// `should_flush_f` is an optional closure that determines whether a given
/// account should be flushed. Passing `None` will by default flush all
/// accounts
///
/// Returns `None` when another thread already holds this slot (it is under
/// contention) or when the slot is no longer in the cache; otherwise flushes
/// and returns the flush stats.
fn flush_slot_cache_with_clean(
    &self,
    slot: Slot,
    should_flush_f: Option<&mut impl FnMut(&Pubkey) -> bool>,
    max_clean_root: Option<Slot>,
) -> Option<FlushStats> {
    // Claim the slot: `insert` returns false if some other thread already
    // registered it in `slots_under_contention`.
    if self
        .remove_unrooted_slots_synchronization
        .slots_under_contention
        .lock()
        .unwrap()
        .insert(slot)
    {
        // We have not seen this slot, flush it.
        let flush_stats = self.accounts_cache.slot_cache(slot).map(|slot_cache| {
            #[cfg(test)]
            {
                // Give some time for cache flushing to occur here for unit tests
                sleep(Duration::from_millis(self.load_delay));
            }
            // Since we added the slot to `slots_under_contention` AND this slot
            // still exists in the cache, we know the slot cannot be removed
            // by any other threads past this point. We are now responsible for
            // flushing this slot.
            self.do_flush_slot_cache(slot, &slot_cache, should_flush_f, max_clean_root)
        });
        // Nobody else should have been purging this slot, so should not have been removed
        // from `self.remove_unrooted_slots_synchronization`.
        assert!(self
            .remove_unrooted_slots_synchronization
            .slots_under_contention
            .lock()
            .unwrap()
            .remove(&slot));
        // Signal to any threads blocked on `remove_unrooted_slots(slot)` that we have finished
        // flushing
        self.remove_unrooted_slots_synchronization
            .signal
            .notify_all();
        flush_stats
    } else {
        // We have already seen this slot. It is already under flushing. Skip.
        None
    }
}
  4757. fn report_store_stats(&self) {
  4758. let mut total_count = 0;
  4759. let mut newest_slot = 0;
  4760. let mut oldest_slot = u64::MAX;
  4761. let mut total_bytes = 0;
  4762. let mut total_alive_bytes = 0;
  4763. for (slot, store) in self.storage.iter() {
  4764. total_count += 1;
  4765. newest_slot = std::cmp::max(newest_slot, slot);
  4766. oldest_slot = std::cmp::min(oldest_slot, slot);
  4767. total_alive_bytes += store.alive_bytes();
  4768. total_bytes += store.capacity();
  4769. }
  4770. info!(
  4771. "total_stores: {total_count}, newest_slot: {newest_slot}, oldest_slot: {oldest_slot}"
  4772. );
  4773. let total_alive_ratio = if total_bytes > 0 {
  4774. total_alive_bytes as f64 / total_bytes as f64
  4775. } else {
  4776. 0.
  4777. };
  4778. datapoint_info!(
  4779. "accounts_db-stores",
  4780. ("total_count", total_count, i64),
  4781. ("total_bytes", total_bytes, i64),
  4782. ("total_alive_bytes", total_alive_bytes, i64),
  4783. ("total_alive_ratio", total_alive_ratio, f64),
  4784. );
  4785. }
/// Calculates the accounts lt hash
///
/// Only intended to be called at startup (or by tests).
/// Only intended to be used while testing the experimental accumulator hash.
pub fn calculate_accounts_lt_hash_at_startup_from_index(
    &self,
    ancestors: &Ancestors,
    startup_slot: Slot,
) -> AccountsLtHash {
    // This impl iterates over all the index bins in parallel, and computes the lt hash
    // sequentially per bin. Then afterwards reduces to a single lt hash.
    // This implementation is quite fast. Runtime is about 150 seconds on mnb as of 10/2/2024.
    // The sequential implementation took about 6,275 seconds!
    // A different parallel implementation that iterated over the bins *sequentially* and then
    // hashed the accounts *within* a bin in parallel took about 600 seconds. That impl uses
    // less memory, as only a single index bin is loaded into mem at a time.
    let lt_hash = self
        .accounts_index
        .account_maps
        .par_iter()
        .fold(
            LtHash::identity,
            |mut accumulator_lt_hash, accounts_index_bin| {
                for pubkey in accounts_index_bin.keys() {
                    // Resolve the version of this account visible from
                    // `ancestors`/`startup_slot`; zero-lamport accounts do not
                    // contribute to the lt hash (account_lt_hash stays None).
                    let account_lt_hash = self
                        .accounts_index
                        .get_with_and_then(
                            &pubkey,
                            Some(ancestors),
                            Some(startup_slot),
                            false,
                            |(slot, account_info)| {
                                (!account_info.is_zero_lamport()).then(|| {
                                    self.get_account_accessor(
                                        slot,
                                        &pubkey,
                                        &account_info.storage_location(),
                                    )
                                    .get_loaded_account(|loaded_account| {
                                        Self::lt_hash_account(&loaded_account, &pubkey)
                                    })
                                    // SAFETY: The index said this pubkey exists, so
                                    // there must be an account to load.
                                    .unwrap()
                                })
                            },
                        )
                        .flatten();
                    if let Some(account_lt_hash) = account_lt_hash {
                        accumulator_lt_hash.mix_in(&account_lt_hash.0);
                    }
                }
                accumulator_lt_hash
            },
        )
        // Combine the per-rayon-split partial hashes into one.
        .reduce(LtHash::identity, |mut accum, elem| {
            accum.mix_in(&elem);
            accum
        });
    AccountsLtHash(lt_hash)
}
/// Calculates the capitalization
///
/// Panics if capitalization overflows a u64.
///
/// Note, this is *very* expensive! It walks the whole accounts index,
/// account-by-account, summing each account's balance.
///
/// Only intended to be called at startup by ledger-tool or tests.
pub fn calculate_capitalization_at_startup_from_index(
    &self,
    ancestors: &Ancestors,
    startup_slot: Slot,
) -> u64 {
    self.accounts_index
        .account_maps
        .par_iter()
        .map(|accounts_index_bin| {
            accounts_index_bin
                .keys()
                .into_iter()
                .map(|pubkey| {
                    // Read the lamports of the version visible from
                    // `ancestors`/`startup_slot`; zero-lamport or missing
                    // entries contribute 0.
                    self.accounts_index
                        .get_with_and_then(
                            &pubkey,
                            Some(ancestors),
                            Some(startup_slot),
                            false,
                            |(slot, account_info)| {
                                (!account_info.is_zero_lamport()).then(|| {
                                    self.get_account_accessor(
                                        slot,
                                        &pubkey,
                                        &account_info.storage_location(),
                                    )
                                    .get_loaded_account(|loaded_account| {
                                        loaded_account.lamports()
                                    })
                                    // SAFETY: The index said this pubkey exists, so
                                    // there must be an account to load.
                                    .unwrap()
                                })
                            },
                        )
                        .flatten()
                        .unwrap_or(0)
                })
                // try_fold/try_reduce short-circuit to None on u64 overflow,
                // which trips the expect below.
                .try_fold(0, u64::checked_add)
        })
        .try_reduce(|| 0, u64::checked_add)
        .expect("capitalization cannot overflow")
}
  4898. /// return slot + offset, where offset can be +/-
  4899. fn apply_offset_to_slot(slot: Slot, offset: i64) -> Slot {
  4900. if offset > 0 {
  4901. slot.saturating_add(offset as u64)
  4902. } else {
  4903. slot.saturating_sub(offset.unsigned_abs())
  4904. }
  4905. }
  4906. /// Returns all of the accounts' pubkeys for a given slot
  4907. pub fn get_pubkeys_for_slot(&self, slot: Slot) -> Vec<Pubkey> {
  4908. let scan_result = self.scan_cache_storage_fallback(
  4909. slot,
  4910. |loaded_account| Some(*loaded_account.pubkey()),
  4911. |accum: &mut HashSet<Pubkey>, storage| {
  4912. storage
  4913. .scan_pubkeys(|pubkey| {
  4914. accum.insert(*pubkey);
  4915. })
  4916. .expect("must scan accounts storage");
  4917. },
  4918. );
  4919. match scan_result {
  4920. ScanStorageResult::Cached(cached_result) => cached_result,
  4921. ScanStorageResult::Stored(stored_result) => stored_result.into_iter().collect(),
  4922. }
  4923. }
  4924. /// Return all of the accounts for a given slot
  4925. pub fn get_pubkey_account_for_slot(&self, slot: Slot) -> Vec<(Pubkey, AccountSharedData)> {
  4926. let scan_result = self.scan_account_storage(
  4927. slot,
  4928. |loaded_account| {
  4929. // Cache only has one version per key, don't need to worry about versioning
  4930. Some((*loaded_account.pubkey(), loaded_account.take_account()))
  4931. },
  4932. |accum: &mut HashMap<_, _>, stored_account, data| {
  4933. // SAFETY: We called scan_account_storage() with
  4934. // ScanAccountStorageData::DataRefForStorage, so `data` must be Some.
  4935. let data = data.unwrap();
  4936. let loaded_account =
  4937. LoadedAccount::Stored(StoredAccountInfo::new_from(stored_account, data));
  4938. // Storage may have duplicates so only keep the latest version for each key
  4939. accum.insert(*loaded_account.pubkey(), loaded_account.take_account());
  4940. },
  4941. ScanAccountStorageData::DataRefForStorage,
  4942. );
  4943. match scan_result {
  4944. ScanStorageResult::Cached(cached_result) => cached_result,
  4945. ScanStorageResult::Stored(stored_result) => stored_result.into_iter().collect(),
  4946. }
  4947. }
/// Updates the accounts index with the given `infos` and `accounts`.
/// Returns a vector of `SlotList<AccountInfo>` containing the reclaims for each batch processed.
/// The element of the returned vector is guaranteed to be non-empty.
fn update_index<'a>(
    &self,
    infos: Vec<AccountInfo>,
    accounts: &impl StorableAccounts<'a>,
    reclaim: UpsertReclaim,
    update_index_thread_selection: UpdateIndexThreadSelection,
    thread_pool: &ThreadPool,
) -> Vec<ReclaimsSlotList<AccountInfo>> {
    let target_slot = accounts.target_slot();
    // Only process pairs that exist in both `accounts` and `infos`.
    let len = std::cmp::min(accounts.len(), infos.len());
    // If reclaiming old slots, ensure the target slot is a root.
    // Having an unrooted slot reclaim a rooted version of a slot
    // could lead to index corruption if the unrooted version is
    // discarded
    if reclaim == UpsertReclaim::ReclaimOldSlots {
        assert!(target_slot <= self.accounts_index.max_root_inclusive());
    }
    // Upserts entries [start, end) into the index; returns the slot-list
    // entries that were displaced (reclaimed) by the upserts.
    let update = |start, end| {
        // Capacity heuristic: expect roughly half the upserts to reclaim.
        let mut reclaims = ReclaimsSlotList::with_capacity((end - start) / 2);
        (start..end).for_each(|i| {
            let info = infos[i];
            accounts.account(i, |account| {
                let old_slot = accounts.slot(i);
                self.accounts_index.upsert(
                    target_slot,
                    old_slot,
                    account.pubkey(),
                    &account,
                    &self.account_indexes,
                    info,
                    &mut reclaims,
                    reclaim,
                );
            });
        });
        reclaims
    };
    let threshold = 1;
    if matches!(
        update_index_thread_selection,
        UpdateIndexThreadSelection::PoolWithThreshold,
    ) && len > threshold
    {
        let chunk_size = std::cmp::max(1, len / quarter_thread_count()); // # pubkeys/thread
        // `batches` may include one final (possibly empty) tail chunk; empty
        // reclaim lists are filtered out below.
        let batches = 1 + len / chunk_size;
        thread_pool.install(|| {
            (0..batches)
                .into_par_iter()
                .map(|batch| {
                    let start = batch * chunk_size;
                    let end = std::cmp::min(start + chunk_size, len);
                    update(start, end)
                })
                .filter(|reclaims| !reclaims.is_empty())
                .collect()
        })
    } else {
        // Single-threaded path: one batch covering everything.
        let reclaims = update(0, len);
        if reclaims.is_empty() {
            // If no reclaims, return an empty vector
            vec![]
        } else {
            vec![reclaims]
        }
    }
}
  5017. fn should_not_shrink(alive_bytes: u64, total_bytes: u64) -> bool {
  5018. alive_bytes >= total_bytes
  5019. }
  5020. fn is_shrinking_productive(store: &AccountStorageEntry) -> bool {
  5021. let alive_count = store.count();
  5022. let total_bytes = store.capacity();
  5023. let alive_bytes = store.alive_bytes_exclude_zero_lamport_single_ref_accounts() as u64;
  5024. if Self::should_not_shrink(alive_bytes, total_bytes) {
  5025. trace!(
  5026. "shrink_slot_forced ({}): not able to shrink at all: num alive: {}, bytes alive: \
  5027. {}, bytes total: {}, bytes saved: {}",
  5028. store.slot(),
  5029. alive_count,
  5030. alive_bytes,
  5031. total_bytes,
  5032. total_bytes.saturating_sub(alive_bytes),
  5033. );
  5034. return false;
  5035. }
  5036. true
  5037. }
  5038. /// Determines whether a given AccountStorageEntry instance is a
  5039. /// candidate for shrinking.
  5040. pub(crate) fn is_candidate_for_shrink(&self, store: &AccountStorageEntry) -> bool {
  5041. // appended ancient append vecs should not be shrunk by the normal shrink codepath.
  5042. // It is not possible to identify ancient append vecs when we pack, so no check for ancient when we are not appending.
  5043. let total_bytes = store.capacity();
  5044. let alive_bytes = store.alive_bytes_exclude_zero_lamport_single_ref_accounts() as u64;
  5045. match self.shrink_ratio {
  5046. AccountShrinkThreshold::TotalSpace { shrink_ratio: _ } => alive_bytes < total_bytes,
  5047. AccountShrinkThreshold::IndividualStore { shrink_ratio } => {
  5048. (alive_bytes as f64 / total_bytes as f64) < shrink_ratio
  5049. }
  5050. }
  5051. }
/// returns (dead slots, reclaimed_offsets)
///
/// Removes the reclaimed accounts from their storages (updating alive
/// bytes/counts), optionally marks them obsolete, and classifies each
/// affected slot as dead (no alive accounts remain) or as a shrink candidate.
fn remove_dead_accounts<'a, I>(
    &'a self,
    reclaims: I,
    expected_slot: Option<Slot>,
    mark_accounts_obsolete: MarkAccountsObsolete,
) -> (IntSet<Slot>, SlotOffsets)
where
    I: Iterator<Item = &'a (Slot, AccountInfo)>,
{
    let mut reclaimed_offsets = SlotOffsets::default();
    assert!(self.storage.no_shrink_in_progress());
    let mut dead_slots = IntSet::default();
    let mut new_shrink_candidates = ShrinkCandidates::default();
    let mut measure = Measure::start("remove");
    // Group the reclaimed storage offsets by slot.
    for (slot, account_info) in reclaims {
        // No cached accounts should make it here
        assert!(!account_info.is_cached());
        reclaimed_offsets
            .entry(*slot)
            .or_default()
            .insert(account_info.offset());
    }
    if let Some(expected_slot) = expected_slot {
        assert_eq!(reclaimed_offsets.len(), 1);
        assert!(reclaimed_offsets.contains_key(&expected_slot));
    }
    self.clean_accounts_stats
        .slots_cleaned
        .fetch_add(reclaimed_offsets.len() as u64, Ordering::Relaxed);
    reclaimed_offsets.iter().for_each(|(slot, offsets)| {
        if let Some(store) = self.storage.get_slot_storage_entry(*slot) {
            assert_eq!(
                *slot,
                store.slot(),
                "AccountsDB::accounts_index corrupted. Storage pointed to: {}, expected: {}, \
                 should only point to one slot",
                store.slot(),
                *slot
            );
            let remaining_accounts = if offsets.len() == store.count() {
                // all remaining alive accounts in the storage are being removed, so the entire storage/slot is dead
                store.remove_accounts(store.alive_bytes(), offsets.len())
            } else {
                // not all accounts are being removed, so figure out sizes of accounts we are removing and update the alive bytes and alive account count
                let (remaining_accounts, us) = measure_us!({
                    let mut offsets = offsets.iter().cloned().collect::<Vec<_>>();
                    // sort so offsets are in order. This improves efficiency of loading the accounts.
                    offsets.sort_unstable();
                    let data_lens = store.accounts.get_account_data_lens(&offsets);
                    // Convert data lengths to on-disk stored sizes to know how
                    // many bytes become dead.
                    let dead_bytes = data_lens
                        .iter()
                        .map(|len| store.accounts.calculate_stored_size(*len))
                        .sum();
                    let remaining_accounts = store.remove_accounts(dead_bytes, offsets.len());
                    if let MarkAccountsObsolete::Yes(slot_marked_obsolete) =
                        mark_accounts_obsolete
                    {
                        store
                            .obsolete_accounts
                            .write()
                            .unwrap()
                            .mark_accounts_obsolete(
                                offsets.into_iter().zip(data_lens),
                                slot_marked_obsolete,
                            );
                    }
                    remaining_accounts
                });
                self.clean_accounts_stats
                    .get_account_sizes_us
                    .fetch_add(us, Ordering::Relaxed);
                remaining_accounts
            };
            // Check if we have removed all accounts from the storage
            // This may be different from the check above as this
            // can be multithreaded
            if remaining_accounts == 0 {
                self.dirty_stores.insert(*slot, store);
                dead_slots.insert(*slot);
            } else if Self::is_shrinking_productive(&store)
                && self.is_candidate_for_shrink(&store)
            {
                // Checking that this single storage entry is ready for shrinking,
                // should be a sufficient indication that the slot is ready to be shrunk
                // because slots should only have one storage entry, namely the one that was
                // created by `flush_slot_cache()`.
                new_shrink_candidates.insert(*slot);
            };
        }
    });
    measure.stop();
    self.clean_accounts_stats
        .remove_dead_accounts_remove_us
        .fetch_add(measure.as_us(), Ordering::Relaxed);
    // Publish the newly-found shrink candidates under the shared lock.
    let mut measure = Measure::start("shrink");
    let mut shrink_candidate_slots = self.shrink_candidate_slots.lock().unwrap();
    for slot in new_shrink_candidates {
        shrink_candidate_slots.insert(slot);
    }
    drop(shrink_candidate_slots);
    measure.stop();
    self.clean_accounts_stats
        .remove_dead_accounts_shrink_us
        .fetch_add(measure.as_us(), Ordering::Relaxed);
    // A slot is only truly dead if its storage (if any) now holds zero
    // accounts; another thread may have stored to it in the meantime.
    dead_slots.retain(|slot| {
        if let Some(slot_store) = self.storage.get_slot_storage_entry(*slot) {
            if slot_store.count() != 0 {
                return false;
            }
        }
        true
    });
    (dead_slots, reclaimed_offsets)
}
  5167. fn remove_dead_slots_metadata<'a>(&'a self, dead_slots_iter: impl Iterator<Item = &'a Slot>) {
  5168. let mut measure = Measure::start("remove_dead_slots_metadata-ms");
  5169. self.clean_dead_slots_from_accounts_index(dead_slots_iter);
  5170. measure.stop();
  5171. inc_new_counter_info!("remove_dead_slots_metadata-ms", measure.as_ms() as usize);
  5172. }
/// lookup each pubkey in 'pubkeys' and unref it in the accounts index
/// skip pubkeys that are in 'pubkeys_removed_from_accounts_index'
fn unref_pubkeys<'a>(
    &'a self,
    pubkeys: impl Iterator<Item = &'a Pubkey> + Clone + Send + Sync,
    num_pubkeys: usize,
    pubkeys_removed_from_accounts_index: &'a PubkeysRemovedFromAccountsIndex,
) {
    // Process the pubkeys in fixed-size batches on the background thread
    // pool; each batch clones the iterator and skips to its own window.
    let batches = 1 + (num_pubkeys / UNREF_ACCOUNTS_BATCH_SIZE);
    self.thread_pool_background.install(|| {
        (0..batches).into_par_iter().for_each(|batch| {
            let skip = batch * UNREF_ACCOUNTS_BATCH_SIZE;
            self.accounts_index.scan(
                pubkeys
                    .clone()
                    .skip(skip)
                    .take(UNREF_ACCOUNTS_BATCH_SIZE)
                    .filter(|pubkey| {
                        // filter out pubkeys that have already been removed from the accounts index in a previous step
                        let already_removed =
                            pubkeys_removed_from_accounts_index.contains(pubkey);
                        !already_removed
                    }),
                |_pubkey, slots_refs| {
                    if let Some((slot_list, ref_count)) = slots_refs {
                        // Let's handle the special case - after unref, the result is a single ref zero lamport account.
                        // (ref_count == 2 here because the Unref below will drop it to 1.)
                        if slot_list.len() == 1 && ref_count == 2 {
                            if let Some((slot_alive, acct_info)) = slot_list.first() {
                                if acct_info.is_zero_lamport() && !acct_info.is_cached() {
                                    self.zero_lamport_single_ref_found(
                                        *slot_alive,
                                        acct_info.offset(),
                                    );
                                }
                            }
                        }
                    }
                    AccountsIndexScanResult::Unref
                },
                None,
                ScanFilter::All,
            )
        });
    });
}
  5218. /// lookup each pubkey in 'purged_slot_pubkeys' and unref it in the accounts index
  5219. /// populate 'purged_stored_account_slots' by grouping 'purged_slot_pubkeys' by pubkey
  5220. /// pubkeys_removed_from_accounts_index - These keys have already been removed from the accounts index
  5221. /// and should not be unref'd. If they exist in the accounts index, they are NEW.
  5222. fn unref_accounts(
  5223. &self,
  5224. purged_slot_pubkeys: HashSet<(Slot, Pubkey)>,
  5225. purged_stored_account_slots: &mut AccountSlots,
  5226. pubkeys_removed_from_accounts_index: &PubkeysRemovedFromAccountsIndex,
  5227. ) {
  5228. self.unref_pubkeys(
  5229. purged_slot_pubkeys.iter().map(|(_slot, pubkey)| pubkey),
  5230. purged_slot_pubkeys.len(),
  5231. pubkeys_removed_from_accounts_index,
  5232. );
  5233. for (slot, pubkey) in purged_slot_pubkeys {
  5234. purged_stored_account_slots
  5235. .entry(pubkey)
  5236. .or_default()
  5237. .insert(slot);
  5238. }
  5239. }
  5240. fn clean_dead_slots_from_accounts_index<'a>(
  5241. &'a self,
  5242. dead_slots_iter: impl Iterator<Item = &'a Slot>,
  5243. ) {
  5244. let mut accounts_index_root_stats = AccountsIndexRootsStats::default();
  5245. let mut measure = Measure::start("clean_dead_slot");
  5246. let mut rooted_cleaned_count = 0;
  5247. let mut unrooted_cleaned_count = 0;
  5248. let dead_slots: Vec<_> = dead_slots_iter
  5249. .map(|slot| {
  5250. if self.accounts_index.clean_dead_slot(*slot) {
  5251. rooted_cleaned_count += 1;
  5252. } else {
  5253. unrooted_cleaned_count += 1;
  5254. }
  5255. *slot
  5256. })
  5257. .collect();
  5258. measure.stop();
  5259. accounts_index_root_stats.clean_dead_slot_us += measure.as_us();
  5260. if self.log_dead_slots.load(Ordering::Relaxed) {
  5261. info!(
  5262. "remove_dead_slots_metadata: {} dead slots",
  5263. dead_slots.len()
  5264. );
  5265. trace!("remove_dead_slots_metadata: dead_slots: {dead_slots:?}");
  5266. }
  5267. self.accounts_index
  5268. .update_roots_stats(&mut accounts_index_root_stats);
  5269. accounts_index_root_stats.rooted_cleaned_count += rooted_cleaned_count;
  5270. accounts_index_root_stats.unrooted_cleaned_count += unrooted_cleaned_count;
  5271. self.clean_accounts_stats
  5272. .latest_accounts_index_roots_stats
  5273. .update(&accounts_index_root_stats);
  5274. }
    /// Unrefs (in the accounts index) every non-obsolete account stored in the
    /// given dead slots, optionally reporting the purged (pubkey -> slots)
    /// mapping back to the caller via `purged_account_slots`.
    ///
    /// pubkeys_removed_from_accounts_index - These keys have already been removed from the accounts index
    /// and should not be unref'd. If they exist in the accounts index, they are NEW.
    fn clean_stored_dead_slots(
        &self,
        dead_slots: &IntSet<Slot>,
        purged_account_slots: Option<&mut AccountSlots>,
        pubkeys_removed_from_accounts_index: &PubkeysRemovedFromAccountsIndex,
    ) {
        let mut measure = Measure::start("clean_stored_dead_slots-ms");
        let mut stores = vec![];
        // get all stores in a vec so we can iterate in parallel
        for slot in dead_slots.iter() {
            if let Some(slot_storage) = self.storage.get_slot_storage_entry(*slot) {
                stores.push(slot_storage);
            }
        }
        // get all pubkeys in all dead slots (scanning one storage per rayon task)
        let purged_slot_pubkeys: HashSet<(Slot, Pubkey)> = {
            self.thread_pool_background.install(|| {
                stores
                    .into_par_iter()
                    .map(|store| {
                        let slot = store.slot();
                        let mut pubkeys = Vec::with_capacity(store.count());
                        // Obsolete accounts are already unreffed before this point, so do not add
                        // them to the pubkeys list.
                        let obsolete_accounts: HashSet<_> = store
                            .obsolete_accounts_read_lock()
                            .filter_obsolete_accounts(None)
                            .collect();
                        store
                            .accounts
                            .scan_accounts_without_data(|offset, account| {
                                // (offset, data_len) identifies an entry in the obsolete set
                                if !obsolete_accounts.contains(&(offset, account.data_len)) {
                                    pubkeys.push((slot, *account.pubkey));
                                }
                            })
                            .expect("must scan accounts storage");
                        pubkeys
                    })
                    .flatten()
                    .collect::<HashSet<_>>()
            })
        };
        // Unref the accounts from storage. Note: the unref only happens when the
        // caller asked for the purged mapping (`purged_account_slots` is Some).
        let mut accounts_index_root_stats = AccountsIndexRootsStats::default();
        let mut measure_unref = Measure::start("unref_from_storage");
        if let Some(purged_account_slots) = purged_account_slots {
            self.unref_accounts(
                purged_slot_pubkeys,
                purged_account_slots,
                pubkeys_removed_from_accounts_index,
            );
        }
        measure_unref.stop();
        accounts_index_root_stats.clean_unref_from_storage_us += measure_unref.as_us();
        self.clean_accounts_stats
            .latest_accounts_index_roots_stats
            .update(&accounts_index_root_stats);
        measure.stop();
        self.clean_accounts_stats
            .clean_stored_dead_slots_us
            .fetch_add(measure.as_us(), Ordering::Relaxed);
    }
  5339. /// Stores accounts in the write cache and updates the index.
  5340. /// This should only be used for accounts that are unrooted (unfrozen)
  5341. pub(crate) fn store_accounts_unfrozen<'a>(
  5342. &self,
  5343. accounts: impl StorableAccounts<'a>,
  5344. transactions: Option<&'a [&'a SanitizedTransaction]>,
  5345. update_index_thread_selection: UpdateIndexThreadSelection,
  5346. ) {
  5347. // If all transactions in a batch are errored,
  5348. // it's possible to get a store with no accounts.
  5349. if accounts.is_empty() {
  5350. return;
  5351. }
  5352. let mut total_data = 0;
  5353. (0..accounts.len()).for_each(|index| {
  5354. total_data += accounts.data_len(index);
  5355. });
  5356. self.stats
  5357. .store_total_data
  5358. .fetch_add(total_data as u64, Ordering::Relaxed);
  5359. // Store the accounts in the write cache
  5360. let mut store_accounts_time = Measure::start("store_accounts");
  5361. let infos = self.write_accounts_to_cache(accounts.target_slot(), &accounts, transactions);
  5362. store_accounts_time.stop();
  5363. self.stats
  5364. .store_accounts
  5365. .fetch_add(store_accounts_time.as_us(), Ordering::Relaxed);
  5366. // Update the index
  5367. let mut update_index_time = Measure::start("update_index");
  5368. self.update_index(
  5369. infos,
  5370. &accounts,
  5371. UpsertReclaim::PreviousSlotEntryWasCached,
  5372. update_index_thread_selection,
  5373. &self.thread_pool_foreground,
  5374. );
  5375. update_index_time.stop();
  5376. self.stats
  5377. .store_update_index
  5378. .fetch_add(update_index_time.as_us(), Ordering::Relaxed);
  5379. self.stats
  5380. .store_num_accounts
  5381. .fetch_add(accounts.len() as u64, Ordering::Relaxed);
  5382. self.report_store_timings();
  5383. }
    /// Stores accounts in the storage and updates the index.
    /// This function is intended for accounts that are rooted (frozen).
    /// - `UpsertReclaims` is set to `IgnoreReclaims`. If the slot in `accounts` differs from the new slot,
    ///   accounts may be removed from the account index. In such cases, the caller must ensure that alive
    ///   accounts are decremented for the older storage or that the old storage is removed entirely
    pub fn store_accounts_frozen<'a>(
        &self,
        accounts: impl StorableAccounts<'a>,
        storage: &Arc<AccountStorageEntry>,
        update_index_thread_selection: UpdateIndexThreadSelection,
    ) -> StoreAccountsTiming {
        // Thin wrapper that pins the reclaim mode to `IgnoreReclaims` for
        // external callers; all real work happens in `_store_accounts_frozen`.
        self._store_accounts_frozen(
            accounts,
            storage,
            UpsertReclaim::IgnoreReclaims,
            update_index_thread_selection,
        )
    }
    /// Stores accounts in the storage and updates the index.
    /// This function is intended for accounts that are rooted (frozen).
    /// - `UpsertReclaims` must be set to `IgnoreReclaims` at this time
    ///
    /// Returns the timings for the three phases: storing to the accounts file,
    /// updating the index, and handling any reclaims produced by the update.
    fn _store_accounts_frozen<'a>(
        &self,
        accounts: impl StorableAccounts<'a>,
        storage: &Arc<AccountStorageEntry>,
        reclaim_handling: UpsertReclaim,
        update_index_thread_selection: UpdateIndexThreadSelection,
    ) -> StoreAccountsTiming {
        let slot = accounts.target_slot();
        let mut store_accounts_time = Measure::start("store_accounts");
        // Flush the read cache if necessary. This will occur during shrink or clean
        if self.read_only_accounts_cache.can_slot_be_in_cache(slot) {
            (0..accounts.len()).for_each(|index| {
                // based on the patterns of how a validator writes accounts, it is almost always the case that there is no read only cache entry
                // for this pubkey and slot. So, we can give that hint to the `remove` for performance.
                self.read_only_accounts_cache
                    .remove_assume_not_present(accounts.pubkey(index));
            });
        }
        // Write the accounts to storage
        let infos = self.write_accounts_to_storage(slot, storage, &accounts);
        store_accounts_time.stop();
        self.stats
            .store_accounts
            .fetch_add(store_accounts_time.as_us(), Ordering::Relaxed);
        // Record zero-lamport single-ref offsets (only does work for `ReclaimOldSlots`)
        self.mark_zero_lamport_single_ref_accounts(&infos, storage, reclaim_handling);
        let mut update_index_time = Measure::start("update_index");
        // If the cache was flushed, then because `update_index` occurs
        // after the accounts are stored by the above `write_accounts_to_storage`
        // call and all the accounts are stored, all reads after this point
        // will know to not check the cache anymore
        let reclaims = self.update_index(
            infos,
            &accounts,
            reclaim_handling,
            update_index_thread_selection,
            &self.thread_pool_background,
        );
        update_index_time.stop();
        self.stats
            .store_update_index
            .fetch_add(update_index_time.as_us(), Ordering::Relaxed);
        self.stats
            .store_num_accounts
            .fetch_add(accounts.len() as u64, Ordering::Relaxed);
        // If there are any reclaims then they should be handled. Reclaims affect
        // all storages, and may result in the removal of dead storages.
        let mut handle_reclaims_elapsed = 0;
        // since reclaims only contains non-empty SlotList<AccountInfo>, we
        // should skip handle_reclaims only when reclaims is empty. No need to
        // check the elements of reclaims are empty.
        if !reclaims.is_empty() {
            let reclaims_len = reclaims.iter().map(|r| r.len()).sum::<usize>();
            self.stats
                .num_reclaims
                .fetch_add(reclaims_len as u64, Ordering::Relaxed);
            let purge_stats = PurgeStats::default();
            let mut handle_reclaims_time = Measure::start("handle_reclaims");
            self.handle_reclaims(
                reclaims.iter().flatten(),
                None,
                &HashSet::default(),
                HandleReclaims::ProcessDeadSlots(&purge_stats),
                // reclaimed entries are superseded by this store at `slot`
                MarkAccountsObsolete::Yes(slot),
            );
            handle_reclaims_time.stop();
            handle_reclaims_elapsed = handle_reclaims_time.as_us();
            self.stats.num_obsolete_slots_removed.fetch_add(
                purge_stats.num_stored_slots_removed.load(Ordering::Relaxed),
                Ordering::Relaxed,
            );
            self.stats.num_obsolete_bytes_removed.fetch_add(
                purge_stats
                    .total_removed_stored_bytes
                    .load(Ordering::Relaxed),
                Ordering::Relaxed,
            );
            self.stats
                .store_handle_reclaims
                .fetch_add(handle_reclaims_elapsed, Ordering::Relaxed);
        }
        StoreAccountsTiming {
            store_accounts_elapsed: store_accounts_time.as_us(),
            update_index_elapsed: update_index_time.as_us(),
            handle_reclaims_elapsed,
        }
    }
  5491. fn write_accounts_to_cache<'a, 'b>(
  5492. &self,
  5493. slot: Slot,
  5494. accounts_and_meta_to_store: &impl StorableAccounts<'b>,
  5495. txs: Option<&[&SanitizedTransaction]>,
  5496. ) -> Vec<AccountInfo> {
  5497. let mut current_write_version = if self.accounts_update_notifier.is_some() {
  5498. self.write_version
  5499. .fetch_add(accounts_and_meta_to_store.len() as u64, Ordering::AcqRel)
  5500. } else {
  5501. 0
  5502. };
  5503. (0..accounts_and_meta_to_store.len())
  5504. .map(|index| {
  5505. let txn = txs.map(|txs| *txs.get(index).expect("txs must be present if provided"));
  5506. accounts_and_meta_to_store.account_default_if_zero_lamport(index, |account| {
  5507. let account_shared_data = account.take_account();
  5508. let pubkey = account.pubkey();
  5509. let account_info =
  5510. AccountInfo::new(StorageLocation::Cached, account.is_zero_lamport());
  5511. self.notify_account_at_accounts_update(
  5512. slot,
  5513. &account_shared_data,
  5514. &txn,
  5515. pubkey,
  5516. current_write_version,
  5517. );
  5518. current_write_version = current_write_version.saturating_add(1);
  5519. self.accounts_cache.store(slot, pubkey, account_shared_data);
  5520. account_info
  5521. })
  5522. })
  5523. .collect()
  5524. }
    /// Appends `accounts_and_meta_to_store` to `storage`, returning one
    /// `AccountInfo` (with its final storage offset) per account.
    ///
    /// If the storage runs out of space mid-batch, an oversized replacement
    /// store is created for `slot` and the loop retries the remaining accounts.
    fn write_accounts_to_storage<'a>(
        &self,
        slot: Slot,
        storage: &AccountStorageEntry,
        accounts_and_meta_to_store: &impl StorableAccounts<'a>,
    ) -> Vec<AccountInfo> {
        let mut infos: Vec<AccountInfo> = Vec::with_capacity(accounts_and_meta_to_store.len());
        let mut total_append_accounts_us = 0;
        // `infos.len()` doubles as the index of the next account still to be written,
        // so each retry resumes where the previous write stopped.
        while infos.len() < accounts_and_meta_to_store.len() {
            let mut append_accounts = Measure::start("append_accounts");
            let stored_accounts_info = storage
                .accounts
                .write_accounts(accounts_and_meta_to_store, infos.len());
            append_accounts.stop();
            total_append_accounts_us += append_accounts.as_us();
            let Some(stored_accounts_info) = stored_accounts_info else {
                // See if an account overflows the storage in the slot.
                let data_len = accounts_and_meta_to_store.data_len(infos.len());
                let data_len = (data_len + STORE_META_OVERHEAD) as u64;
                if data_len > storage.accounts.remaining_bytes() {
                    info!(
                        "write_accounts_to_storage, no space: {}, {}, {}, {}, {}",
                        storage.accounts.capacity(),
                        storage.accounts.remaining_bytes(),
                        data_len,
                        infos.len(),
                        accounts_and_meta_to_store.len()
                    );
                    // Make the replacement store big enough for the oversized account.
                    let special_store_size = std::cmp::max(data_len * 2, self.file_size);
                    self.create_and_insert_store(slot, special_store_size, "large create");
                }
                continue;
            };
            let store_id = storage.id();
            for (i, offset) in stored_accounts_info.offsets.iter().enumerate() {
                infos.push(AccountInfo::new(
                    StorageLocation::AppendVec(store_id, *offset),
                    accounts_and_meta_to_store.is_zero_lamport(i),
                ));
            }
            storage.add_accounts(
                stored_accounts_info.offsets.len(),
                stored_accounts_info.size,
            );
        }
        self.stats
            .store_append_accounts
            .fetch_add(total_append_accounts_us, Ordering::Relaxed);
        infos
    }
  5575. /// Marks zero lamport single reference accounts in the storage during store_accounts
  5576. fn mark_zero_lamport_single_ref_accounts(
  5577. &self,
  5578. account_infos: &[AccountInfo],
  5579. storage: &AccountStorageEntry,
  5580. reclaim_handling: UpsertReclaim,
  5581. ) {
  5582. // If the reclaim handling is `ReclaimOldSlots`, then all zero lamport accounts are single
  5583. // ref accounts and they need to be inserted into the storages zero lamport single ref
  5584. // accounts list
  5585. // For other values of reclaim handling, there are no zero lamport single ref accounts
  5586. // so nothing needs to be done in this function
  5587. if reclaim_handling == UpsertReclaim::ReclaimOldSlots {
  5588. let mut add_zero_lamport_accounts = Measure::start("add_zero_lamport_accounts");
  5589. let mut num_zero_lamport_accounts_added = 0;
  5590. for account_info in account_infos {
  5591. if account_info.is_zero_lamport() {
  5592. storage.insert_zero_lamport_single_ref_account_offset(account_info.offset());
  5593. num_zero_lamport_accounts_added += 1;
  5594. }
  5595. }
  5596. // If any zero lamport accounts were added, the storage may be valid for shrinking
  5597. if num_zero_lamport_accounts_added > 0
  5598. && self.is_candidate_for_shrink(storage)
  5599. && Self::is_shrinking_productive(storage)
  5600. {
  5601. self.shrink_candidate_slots
  5602. .lock()
  5603. .unwrap()
  5604. .insert(storage.slot);
  5605. }
  5606. add_zero_lamport_accounts.stop();
  5607. self.stats
  5608. .add_zero_lamport_accounts_us
  5609. .fetch_add(add_zero_lamport_accounts.as_us(), Ordering::Relaxed);
  5610. self.stats
  5611. .num_zero_lamport_accounts_added
  5612. .fetch_add(num_zero_lamport_accounts_added, Ordering::Relaxed);
  5613. }
  5614. }
    /// Emits the accumulated store/clean/read-cache metrics as datapoints,
    /// resetting (swap-to-zero) each counter as it is reported.
    ///
    /// Rate-limited: only reports when at least 1000ms have passed since the
    /// previous report (per `last_store_report`).
    fn report_store_timings(&self) {
        if self.stats.last_store_report.should_update(1000) {
            // Read-cache stats are fetched-and-reset as one snapshot.
            let read_cache_stats = self.read_only_accounts_cache.get_and_reset_stats();
            datapoint_info!(
                "accounts_db_store_timings",
                (
                    "store_accounts",
                    self.stats.store_accounts.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "update_index",
                    self.stats.store_update_index.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "handle_reclaims",
                    self.stats.store_handle_reclaims.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "append_accounts",
                    self.stats.store_append_accounts.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "stakes_cache_check_and_store_us",
                    self.stats
                        .stakes_cache_check_and_store_us
                        .swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "num_accounts",
                    self.stats.store_num_accounts.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "total_data",
                    self.stats.store_total_data.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "num_reclaims",
                    self.stats.num_reclaims.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "read_only_accounts_cache_entries",
                    self.read_only_accounts_cache.cache_len(),
                    i64
                ),
                (
                    "read_only_accounts_cache_data_size",
                    self.read_only_accounts_cache.data_size(),
                    i64
                ),
                ("read_only_accounts_cache_hits", read_cache_stats.hits, i64),
                (
                    "read_only_accounts_cache_misses",
                    read_cache_stats.misses,
                    i64
                ),
                (
                    "read_only_accounts_cache_evicts",
                    read_cache_stats.evicts,
                    i64
                ),
                (
                    "read_only_accounts_cache_load_us",
                    read_cache_stats.load_us,
                    i64
                ),
                (
                    "read_only_accounts_cache_store_us",
                    read_cache_stats.store_us,
                    i64
                ),
                (
                    "read_only_accounts_cache_evict_us",
                    read_cache_stats.evict_us,
                    i64
                ),
                (
                    "read_only_accounts_cache_evictor_wakeup_count_all",
                    read_cache_stats.evictor_wakeup_count_all,
                    i64
                ),
                (
                    "read_only_accounts_cache_evictor_wakeup_count_productive",
                    read_cache_stats.evictor_wakeup_count_productive,
                    i64
                ),
                (
                    "handle_dead_keys_us",
                    self.stats.handle_dead_keys_us.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "purge_exact_us",
                    self.stats.purge_exact_us.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "purge_exact_count",
                    self.stats.purge_exact_count.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "num_obsolete_slots_removed",
                    self.stats
                        .num_obsolete_slots_removed
                        .swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "num_obsolete_bytes_removed",
                    self.stats
                        .num_obsolete_bytes_removed
                        .swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "add_zero_lamport_accounts_us",
                    self.stats
                        .add_zero_lamport_accounts_us
                        .swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "num_zero_lamport_accounts_added",
                    self.stats
                        .num_zero_lamport_accounts_added
                        .swap(0, Ordering::Relaxed),
                    i64
                ),
            );
            datapoint_info!(
                "accounts_db_store_timings2",
                (
                    "create_store_count",
                    self.stats.create_store_count.swap(0, Ordering::Relaxed),
                    i64
                ),
                (
                    "dropped_stores",
                    self.stats.dropped_stores.swap(0, Ordering::Relaxed),
                    i64
                ),
            );
        }
    }
  5767. pub fn add_root(&self, slot: Slot) -> AccountsAddRootTiming {
  5768. let mut index_time = Measure::start("index_add_root");
  5769. self.accounts_index.add_root(slot);
  5770. index_time.stop();
  5771. let mut cache_time = Measure::start("cache_add_root");
  5772. self.accounts_cache.add_root(slot);
  5773. cache_time.stop();
  5774. AccountsAddRootTiming {
  5775. index_us: index_time.as_us(),
  5776. cache_us: cache_time.as_us(),
  5777. }
  5778. }
  5779. /// Returns storages for `requested_slots`
  5780. pub fn get_storages(
  5781. &self,
  5782. requested_slots: impl RangeBounds<Slot> + Sync,
  5783. ) -> (Vec<Arc<AccountStorageEntry>>, Vec<Slot>) {
  5784. let start = Instant::now();
  5785. let (slots, storages) = self
  5786. .storage
  5787. .get_if(|slot, storage| requested_slots.contains(slot) && storage.has_accounts())
  5788. .into_vec()
  5789. .into_iter()
  5790. .unzip();
  5791. let duration = start.elapsed();
  5792. debug!("get_snapshot_storages: {duration:?}");
  5793. (storages, slots)
  5794. }
    /// Returns the latest full snapshot slot
    pub fn latest_full_snapshot_slot(&self) -> Option<Slot> {
        // Read-side accessor of the same field the setter writes under `lock_write`.
        self.latest_full_snapshot_slot.read()
    }
    /// Sets the latest full snapshot slot to `slot`
    pub fn set_latest_full_snapshot_slot(&self, slot: Slot) {
        // This setter only ever writes `Some`; it never clears the value.
        *self.latest_full_snapshot_slot.lock_write() = Some(slot);
    }
    /// Scans one storage and inserts all of its non-obsolete accounts into the
    /// accounts index, returning per-slot aggregates (data len, lt-hash, zero
    /// lamport pubkeys, insert stats) consumed by `generate_index`.
    ///
    /// Side effects: updates secondary indexes, optionally sends geyser
    /// snapshot notifications, records per-store size/count in `storage_info`,
    /// and queues zero lamport pubkeys in `uncleaned_pubkeys`.
    fn generate_index_for_slot<'a>(
        &self,
        reader: &mut impl RequiredLenBufFileRead<'a>,
        storage: &'a AccountStorageEntry,
        slot: Slot,
        store_id: AccountsFileId,
        storage_info: &StorageSizeAndCountMap,
    ) -> SlotIndexGenerationInfo {
        // Empty storage: nothing to index.
        if storage.accounts.get_account_data_lens(&[0]).is_empty() {
            return SlotIndexGenerationInfo::default();
        }
        let mut accounts_data_len = 0;
        let mut stored_size_alive = 0;
        let mut zero_lamport_pubkeys = vec![];
        let mut zero_lamport_offsets = vec![];
        let mut all_accounts_are_zero_lamports = true;
        let mut slot_lt_hash = SlotLtHash::default();
        let mut keyed_account_infos = vec![];
        let geyser_notifier = self
            .accounts_update_notifier
            .as_ref()
            .filter(|notifier| notifier.snapshot_notifications_enabled());
        // If geyser notifications at startup from snapshot are enabled, we need to pass in a
        // write version for each account notification. This value does not need to be
        // globally unique, as geyser plugins also receive the slot number. We only need to
        // ensure that more recent accounts have a higher write version than older accounts.
        // Even more relaxed, we really only need to have different write versions if there are
        // multiple versions of the same account in a single storage, which is not allowed.
        //
        // Since we scan the storage from oldest to newest, we can simply increment a local
        // counter per account and use that for the write version.
        let mut write_version_for_geyser = 0;
        // Collect all the obsolete accounts in this storage into a hashset for fast lookup.
        // Safe to pass in 'None' which will return all obsolete accounts in this Slot.
        // Any accounts marked obsolete in a slot newer than the snapshot slot were filtered out
        // when the obsolete account data was serialized to disk for fastboot
        let obsolete_accounts: IntSet<_> = storage
            .obsolete_accounts_read_lock()
            .filter_obsolete_accounts(None)
            .map(|(offset, _)| offset)
            .collect();
        let mut num_obsolete_accounts_skipped = 0;
        storage
            .accounts
            .scan_accounts(reader, |offset, account| {
                // Obsolete accounts are not indexed at all; just count them.
                if obsolete_accounts.contains(&offset) {
                    num_obsolete_accounts_skipped += 1;
                    return;
                }
                let data_len = account.data.len();
                stored_size_alive += storage.accounts.calculate_stored_size(data_len);
                let is_account_zero_lamport = account.is_zero_lamport();
                if !is_account_zero_lamport {
                    accounts_data_len += data_len as u64;
                    all_accounts_are_zero_lamports = false;
                } else {
                    // With obsolete accounts enabled, all zero lamport accounts
                    // are obsolete or single ref by the end of index generation
                    // Store the offsets here
                    if self.mark_obsolete_accounts == MarkObsoleteAccounts::Enabled {
                        zero_lamport_offsets.push(offset);
                    }
                    zero_lamport_pubkeys.push(*account.pubkey);
                }
                keyed_account_infos.push((
                    *account.pubkey,
                    AccountInfo::new(
                        StorageLocation::AppendVec(store_id, offset), // will never be cached
                        is_account_zero_lamport,
                    ),
                ));
                if !self.account_indexes.is_empty() {
                    self.accounts_index.update_secondary_indexes(
                        account.pubkey,
                        &account,
                        &self.account_indexes,
                    );
                }
                // Fold this account into the slot's lattice hash.
                let account_lt_hash = Self::lt_hash_account(&account, account.pubkey());
                slot_lt_hash.0.mix_in(&account_lt_hash.0);
                if let Some(geyser_notifier) = geyser_notifier {
                    debug_assert!(geyser_notifier.snapshot_notifications_enabled());
                    let account_for_geyser = AccountForGeyser {
                        pubkey: account.pubkey(),
                        lamports: account.lamports(),
                        owner: account.owner(),
                        executable: account.executable(),
                        rent_epoch: account.rent_epoch(),
                        data: account.data(),
                    };
                    geyser_notifier.notify_account_restore_from_snapshot(
                        slot,
                        write_version_for_geyser,
                        &account_for_geyser,
                    );
                    write_version_for_geyser += 1;
                }
            })
            .expect("must scan accounts storage");
        let (insert_time_us, insert_info) = self
            .accounts_index
            .insert_new_if_missing_into_primary_index(slot, keyed_account_infos);
        {
            // second, collect into the shared DashMap once we've figured out all the info per store_id
            let mut info = storage_info.entry(store_id).or_default();
            info.stored_size += stored_size_alive;
            info.count += insert_info.count;
            // sanity check that stored_size is not larger than the u64 aligned size of the accounts files.
            // Note that the stored_size is aligned, so it can be larger than the size of the accounts file.
            assert!(
                info.stored_size <= u64_align!(storage.accounts.len()),
                "Stored size ({}) is larger than the size of the accounts file ({}) for store_id: \
                 {}",
                info.stored_size,
                storage.accounts.len(),
                store_id
            );
        }
        // zero_lamport_pubkeys are candidates for cleaning. So add them to uncleaned_pubkeys
        // for later cleaning. If there is just a single item, there is no cleaning to
        // be done on that pubkey. Use only those pubkeys with multiple updates.
        if !zero_lamport_pubkeys.is_empty() {
            let old = self
                .uncleaned_pubkeys
                .insert(slot, zero_lamport_pubkeys.clone());
            // each slot is visited exactly once during index generation
            assert!(old.is_none());
        }
        // If obsolete accounts are enabled, add them as single ref accounts here
        // to avoid having to revisit them later
        // This is safe with obsolete accounts as all zero lamport accounts will be single ref
        // or obsolete by the end of index generation
        if self.mark_obsolete_accounts == MarkObsoleteAccounts::Enabled {
            storage.batch_insert_zero_lamport_single_ref_account_offsets(&zero_lamport_offsets);
            zero_lamport_pubkeys = Vec::new();
        }
        SlotIndexGenerationInfo {
            insert_time_us,
            num_accounts: insert_info.count as u64,
            accounts_data_len,
            zero_lamport_pubkeys,
            all_accounts_are_zero_lamports,
            num_did_not_exist: insert_info.num_did_not_exist,
            num_existed_in_mem: insert_info.num_existed_in_mem,
            num_existed_on_disk: insert_info.num_existed_on_disk,
            slot_lt_hash,
            num_obsolete_accounts_skipped,
        }
    }
  5951. pub fn generate_index(
  5952. &self,
  5953. limit_load_slot_count_from_snapshot: Option<usize>,
  5954. verify: bool,
  5955. ) -> IndexGenerationInfo {
  5956. let mut total_time = Measure::start("generate_index");
  5957. let mut storages = self.storage.all_storages();
  5958. storages.sort_unstable_by_key(|storage| storage.slot);
  5959. if let Some(limit) = limit_load_slot_count_from_snapshot {
  5960. storages.truncate(limit); // get rid of the newer slots and keep just the older
  5961. }
  5962. let num_storages = storages.len();
  5963. self.accounts_index
  5964. .set_startup(Startup::StartupWithExtraThreads);
  5965. let storage_info = StorageSizeAndCountMap::default();
  5966. /// Accumulator for the values produced while generating the index
  5967. #[derive(Debug)]
  5968. struct IndexGenerationAccumulator {
  5969. insert_us: u64,
  5970. num_accounts: u64,
  5971. accounts_data_len: u64,
  5972. zero_lamport_pubkeys: Vec<Pubkey>,
  5973. all_accounts_are_zero_lamports_slots: u64,
  5974. all_zeros_slots: Vec<(Slot, Arc<AccountStorageEntry>)>,
  5975. num_did_not_exist: u64,
  5976. num_existed_in_mem: u64,
  5977. num_existed_on_disk: u64,
  5978. lt_hash: LtHash,
  5979. num_obsolete_accounts_skipped: u64,
  5980. }
  5981. impl IndexGenerationAccumulator {
  5982. const fn new() -> Self {
  5983. Self {
  5984. insert_us: 0,
  5985. num_accounts: 0,
  5986. accounts_data_len: 0,
  5987. zero_lamport_pubkeys: Vec::new(),
  5988. all_accounts_are_zero_lamports_slots: 0,
  5989. all_zeros_slots: Vec::new(),
  5990. num_did_not_exist: 0,
  5991. num_existed_in_mem: 0,
  5992. num_existed_on_disk: 0,
  5993. lt_hash: LtHash::identity(),
  5994. num_obsolete_accounts_skipped: 0,
  5995. }
  5996. }
  5997. fn accumulate(&mut self, other: Self) {
  5998. self.insert_us += other.insert_us;
  5999. self.num_accounts += other.num_accounts;
  6000. self.accounts_data_len += other.accounts_data_len;
  6001. self.zero_lamport_pubkeys.extend(other.zero_lamport_pubkeys);
  6002. self.all_accounts_are_zero_lamports_slots +=
  6003. other.all_accounts_are_zero_lamports_slots;
  6004. self.all_zeros_slots.extend(other.all_zeros_slots);
  6005. self.num_did_not_exist += other.num_did_not_exist;
  6006. self.num_existed_in_mem += other.num_existed_in_mem;
  6007. self.num_existed_on_disk += other.num_existed_on_disk;
  6008. self.lt_hash.mix_in(&other.lt_hash);
  6009. self.num_obsolete_accounts_skipped += other.num_obsolete_accounts_skipped;
  6010. }
  6011. }
  6012. let mut total_accum = IndexGenerationAccumulator::new();
  6013. let storages_orderer =
  6014. AccountStoragesOrderer::with_random_order(&storages).into_concurrent_consumer();
  6015. let exit_logger = AtomicBool::new(false);
  6016. let num_processed = AtomicU64::new(0);
  6017. let num_threads = num_cpus::get();
  6018. let mut index_time = Measure::start("index");
  6019. thread::scope(|s| {
  6020. let thread_handles = (0..num_threads)
  6021. .map(|i| {
  6022. thread::Builder::new()
  6023. .name(format!("solGenIndex{i:02}"))
  6024. .spawn_scoped(s, || {
  6025. let mut thread_accum = IndexGenerationAccumulator::new();
  6026. let mut reader = append_vec::new_scan_accounts_reader();
  6027. while let Some(next_item) = storages_orderer.next() {
  6028. self.maybe_throttle_index_generation();
  6029. let storage = next_item.storage;
  6030. let store_id = storage.id();
  6031. let slot = storage.slot();
  6032. let slot_info = self.generate_index_for_slot(
  6033. &mut reader,
  6034. storage,
  6035. slot,
  6036. store_id,
  6037. &storage_info,
  6038. );
  6039. thread_accum.insert_us += slot_info.insert_time_us;
  6040. thread_accum.num_accounts += slot_info.num_accounts;
  6041. thread_accum.accounts_data_len += slot_info.accounts_data_len;
  6042. thread_accum
  6043. .zero_lamport_pubkeys
  6044. .extend(slot_info.zero_lamport_pubkeys);
  6045. if slot_info.all_accounts_are_zero_lamports {
  6046. thread_accum.all_accounts_are_zero_lamports_slots += 1;
  6047. thread_accum.all_zeros_slots.push((
  6048. slot,
  6049. Arc::clone(&storages[next_item.original_index]),
  6050. ));
  6051. }
  6052. thread_accum.num_did_not_exist += slot_info.num_did_not_exist;
  6053. thread_accum.num_existed_in_mem += slot_info.num_existed_in_mem;
  6054. thread_accum.num_existed_on_disk += slot_info.num_existed_on_disk;
  6055. thread_accum.lt_hash.mix_in(&slot_info.slot_lt_hash.0);
  6056. thread_accum.num_obsolete_accounts_skipped +=
  6057. slot_info.num_obsolete_accounts_skipped;
  6058. num_processed.fetch_add(1, Ordering::Relaxed);
  6059. }
  6060. thread_accum
  6061. })
  6062. })
  6063. .collect::<Result<Vec<_>, _>>()
  6064. .expect("spawn threads");
  6065. let logger_thread_handle = thread::Builder::new()
  6066. .name("solGenIndexLog".to_string())
  6067. .spawn_scoped(s, || {
  6068. let mut last_update = Instant::now();
  6069. loop {
  6070. if exit_logger.load(Ordering::Relaxed) {
  6071. break;
  6072. }
  6073. let num_processed = num_processed.load(Ordering::Relaxed);
  6074. if num_processed == num_storages as u64 {
  6075. info!("generating index: processed all slots");
  6076. break;
  6077. }
  6078. let now = Instant::now();
  6079. if now - last_update > Duration::from_secs(2) {
  6080. info!(
  6081. "generating index: processed {num_processed}/{num_storages} \
  6082. slots..."
  6083. );
  6084. last_update = now;
  6085. }
  6086. thread::sleep(Duration::from_millis(500))
  6087. }
  6088. })
  6089. .expect("spawn thread");
  6090. for thread_handle in thread_handles {
  6091. let Ok(thread_accum) = thread_handle.join() else {
  6092. exit_logger.store(true, Ordering::Relaxed);
  6093. panic!("index generation failed");
  6094. };
  6095. total_accum.accumulate(thread_accum);
  6096. }
  6097. // Make sure to join the logger thread *after* the main threads.
  6098. // This way, if a main thread errors, we won't spin indefinitely
  6099. // waiting for the logger thread to finish (it never will).
  6100. logger_thread_handle.join().expect("join thread");
  6101. });
  6102. index_time.stop();
  6103. {
  6104. // Update the index stats now.
  6105. let index_stats = self.accounts_index.stats();
  6106. // stats for inserted entries that previously did *not* exist
  6107. index_stats.inc_insert_count(total_accum.num_did_not_exist);
  6108. index_stats.add_mem_count(total_accum.num_did_not_exist as usize);
  6109. // stats for inserted entries that previous did exist *in-mem*
  6110. index_stats
  6111. .entries_from_mem
  6112. .fetch_add(total_accum.num_existed_in_mem, Ordering::Relaxed);
  6113. index_stats
  6114. .updates_in_mem
  6115. .fetch_add(total_accum.num_existed_in_mem, Ordering::Relaxed);
  6116. // stats for inserted entries that previously did exist *on-disk*
  6117. index_stats.add_mem_count(total_accum.num_existed_on_disk as usize);
  6118. index_stats
  6119. .entries_missing
  6120. .fetch_add(total_accum.num_existed_on_disk, Ordering::Relaxed);
  6121. index_stats
  6122. .updates_in_mem
  6123. .fetch_add(total_accum.num_existed_on_disk, Ordering::Relaxed);
  6124. }
  6125. if let Some(geyser_notifier) = &self.accounts_update_notifier {
  6126. // We've finished scanning all the storages, and have thus sent all the
  6127. // account notifications. Now, let the geyser plugins know we're done.
  6128. geyser_notifier.notify_end_of_restore_from_snapshot();
  6129. }
  6130. if verify {
  6131. info!("Verifying index...");
  6132. let start = Instant::now();
  6133. storages.par_iter().for_each(|storage| {
  6134. let store_id = storage.id();
  6135. let slot = storage.slot();
  6136. storage
  6137. .accounts
  6138. .scan_accounts_without_data(|offset, account| {
  6139. let key = account.pubkey();
  6140. self.accounts_index.get_and_then(key, |entry| {
  6141. let index_entry = entry.unwrap();
  6142. let slot_list = index_entry.slot_list_read_lock();
  6143. let mut count = 0;
  6144. for (slot2, account_info2) in slot_list.iter() {
  6145. if *slot2 == slot {
  6146. count += 1;
  6147. let ai = AccountInfo::new(
  6148. StorageLocation::AppendVec(store_id, offset), // will never be cached
  6149. account.is_zero_lamport(),
  6150. );
  6151. assert_eq!(&ai, account_info2);
  6152. }
  6153. }
  6154. assert_eq!(1, count);
  6155. (false, ())
  6156. });
  6157. })
  6158. .expect("must scan accounts storage");
  6159. });
  6160. info!("Verifying index... Done in {:?}", start.elapsed());
  6161. }
  6162. let total_duplicate_slot_keys = AtomicU64::default();
  6163. let total_num_unique_duplicate_keys = AtomicU64::default();
  6164. // outer vec is accounts index bin (determined by pubkey value)
  6165. // inner vec is the pubkeys within that bin that are present in > 1 slot
  6166. let unique_pubkeys_by_bin = Mutex::new(Vec::<Vec<Pubkey>>::default());
  6167. // tell accounts index we are done adding the initial accounts at startup
  6168. let mut m = Measure::start("accounts_index_idle_us");
  6169. self.accounts_index.set_startup(Startup::Normal);
  6170. m.stop();
  6171. let index_flush_us = m.as_us();
  6172. let populate_duplicate_keys_us = measure_us!({
  6173. // this has to happen before visit_duplicate_pubkeys_during_startup below
  6174. // get duplicate keys from acct idx. We have to wait until we've finished flushing.
  6175. self.accounts_index
  6176. .populate_and_retrieve_duplicate_keys_from_startup(|slot_keys| {
  6177. total_duplicate_slot_keys.fetch_add(slot_keys.len() as u64, Ordering::Relaxed);
  6178. let unique_keys =
  6179. HashSet::<Pubkey>::from_iter(slot_keys.iter().map(|(_, key)| *key));
  6180. for (slot, key) in slot_keys {
  6181. self.uncleaned_pubkeys.entry(slot).or_default().push(key);
  6182. }
  6183. let unique_pubkeys_by_bin_inner = unique_keys.into_iter().collect::<Vec<_>>();
  6184. total_num_unique_duplicate_keys
  6185. .fetch_add(unique_pubkeys_by_bin_inner.len() as u64, Ordering::Relaxed);
  6186. // does not matter that this is not ordered by slot
  6187. unique_pubkeys_by_bin
  6188. .lock()
  6189. .unwrap()
  6190. .push(unique_pubkeys_by_bin_inner);
  6191. });
  6192. })
  6193. .1;
  6194. let unique_pubkeys_by_bin = unique_pubkeys_by_bin.into_inner().unwrap();
  6195. let mut timings = GenerateIndexTimings {
  6196. index_flush_us,
  6197. index_time: index_time.as_us(),
  6198. insertion_time_us: total_accum.insert_us,
  6199. total_duplicate_slot_keys: total_duplicate_slot_keys.load(Ordering::Relaxed),
  6200. total_num_unique_duplicate_keys: total_num_unique_duplicate_keys
  6201. .load(Ordering::Relaxed),
  6202. populate_duplicate_keys_us,
  6203. total_including_duplicates: total_accum.num_accounts,
  6204. total_slots: num_storages as u64,
  6205. all_accounts_are_zero_lamports_slots: total_accum.all_accounts_are_zero_lamports_slots,
  6206. num_obsolete_accounts_skipped: total_accum.num_obsolete_accounts_skipped,
  6207. ..GenerateIndexTimings::default()
  6208. };
  6209. #[derive(Debug, Default)]
  6210. struct DuplicatePubkeysVisitedInfo {
  6211. accounts_data_len_from_duplicates: u64,
  6212. num_duplicate_accounts: u64,
  6213. duplicates_lt_hash: Box<DuplicatesLtHash>,
  6214. }
  6215. impl DuplicatePubkeysVisitedInfo {
  6216. fn reduce(mut self, other: Self) -> Self {
  6217. self.accounts_data_len_from_duplicates += other.accounts_data_len_from_duplicates;
  6218. self.num_duplicate_accounts += other.num_duplicate_accounts;
  6219. self.duplicates_lt_hash
  6220. .0
  6221. .mix_in(&other.duplicates_lt_hash.0);
  6222. self
  6223. }
  6224. }
  6225. let (num_zero_lamport_single_refs, visit_zero_lamports_us) = measure_us!(
  6226. self.visit_zero_lamport_pubkeys_during_startup(total_accum.zero_lamport_pubkeys)
  6227. );
  6228. timings.visit_zero_lamports_us = visit_zero_lamports_us;
  6229. timings.num_zero_lamport_single_refs = num_zero_lamport_single_refs;
  6230. let mut visit_duplicate_accounts_timer = Measure::start("visit duplicate accounts");
  6231. let DuplicatePubkeysVisitedInfo {
  6232. accounts_data_len_from_duplicates,
  6233. num_duplicate_accounts,
  6234. duplicates_lt_hash,
  6235. } = unique_pubkeys_by_bin
  6236. .par_iter()
  6237. .fold(
  6238. DuplicatePubkeysVisitedInfo::default,
  6239. |accum, pubkeys_by_bin| {
  6240. let intermediate = pubkeys_by_bin
  6241. .par_chunks(4096)
  6242. .fold(DuplicatePubkeysVisitedInfo::default, |accum, pubkeys| {
  6243. let (
  6244. accounts_data_len_from_duplicates,
  6245. accounts_duplicates_num,
  6246. duplicates_lt_hash,
  6247. ) = self.visit_duplicate_pubkeys_during_startup(pubkeys);
  6248. let intermediate = DuplicatePubkeysVisitedInfo {
  6249. accounts_data_len_from_duplicates,
  6250. num_duplicate_accounts: accounts_duplicates_num,
  6251. duplicates_lt_hash,
  6252. };
  6253. DuplicatePubkeysVisitedInfo::reduce(accum, intermediate)
  6254. })
  6255. .reduce(
  6256. DuplicatePubkeysVisitedInfo::default,
  6257. DuplicatePubkeysVisitedInfo::reduce,
  6258. );
  6259. DuplicatePubkeysVisitedInfo::reduce(accum, intermediate)
  6260. },
  6261. )
  6262. .reduce(
  6263. DuplicatePubkeysVisitedInfo::default,
  6264. DuplicatePubkeysVisitedInfo::reduce,
  6265. );
  6266. visit_duplicate_accounts_timer.stop();
  6267. timings.visit_duplicate_accounts_time_us = visit_duplicate_accounts_timer.as_us();
  6268. timings.num_duplicate_accounts = num_duplicate_accounts;
  6269. total_accum.lt_hash.mix_out(&duplicates_lt_hash.0);
  6270. total_accum.accounts_data_len -= accounts_data_len_from_duplicates;
  6271. info!("accounts data len: {}", total_accum.accounts_data_len);
  6272. // insert all zero lamport account storage into the dirty stores and add them into the uncleaned roots for clean to pick up
  6273. info!(
  6274. "insert all zero slots to clean at startup {}",
  6275. total_accum.all_zeros_slots.len()
  6276. );
  6277. for (slot, storage) in total_accum.all_zeros_slots {
  6278. self.dirty_stores.insert(slot, storage);
  6279. }
  6280. // Need to add these last, otherwise older updates will be cleaned
  6281. for storage in &storages {
  6282. self.accounts_index.add_root(storage.slot());
  6283. }
  6284. self.set_storage_count_and_alive_bytes(storage_info, &mut timings);
  6285. if self.mark_obsolete_accounts == MarkObsoleteAccounts::Enabled {
  6286. let mut mark_obsolete_accounts_time = Measure::start("mark_obsolete_accounts_time");
  6287. // Mark all reclaims at max_slot. This is safe because only the snapshot paths care about
  6288. // this information. Since this account was just restored from the previous snapshot and
  6289. // it is known that it was already obsolete at that time, it must hold true that it will
  6290. // still be obsolete if a newer snapshot is created, since a newer snapshot will always
  6291. // be performed on a slot greater than the current slot
  6292. let slot_marked_obsolete = storages.last().unwrap().slot();
  6293. let obsolete_account_stats =
  6294. self.mark_obsolete_accounts_at_startup(slot_marked_obsolete, unique_pubkeys_by_bin);
  6295. mark_obsolete_accounts_time.stop();
  6296. timings.mark_obsolete_accounts_us = mark_obsolete_accounts_time.as_us();
  6297. timings.num_obsolete_accounts_marked = obsolete_account_stats.accounts_marked_obsolete;
  6298. timings.num_slots_removed_as_obsolete = obsolete_account_stats.slots_removed;
  6299. }
  6300. total_time.stop();
  6301. timings.total_time_us = total_time.as_us();
  6302. timings.report(self.accounts_index.get_startup_stats());
  6303. self.accounts_index.log_secondary_indexes();
  6304. // Now that the index is generated, get the total capacity of the in-mem maps
  6305. // across all the bins and set the initial value for the stat.
  6306. // We do this all at once, at the end, since getting the capacity requries iterating all
  6307. // the bins and grabbing a read lock, which we try to avoid whenever possible.
  6308. let index_capacity = self
  6309. .accounts_index
  6310. .account_maps
  6311. .iter()
  6312. .map(|bin| bin.capacity_for_startup())
  6313. .sum();
  6314. self.accounts_index
  6315. .stats()
  6316. .capacity_in_mem
  6317. .store(index_capacity, Ordering::Relaxed);
  6318. IndexGenerationInfo {
  6319. accounts_data_len: total_accum.accounts_data_len,
  6320. calculated_accounts_lt_hash: AccountsLtHash(total_accum.lt_hash),
  6321. }
  6322. }
  6323. /// Use the duplicated pubkeys to mark all older version of the pubkeys as obsolete
  6324. /// This will unref the accounts and then reclaim the accounts
  6325. fn mark_obsolete_accounts_at_startup(
  6326. &self,
  6327. slot_marked_obsolete: Slot,
  6328. pubkeys_with_duplicates_by_bin: Vec<Vec<Pubkey>>,
  6329. ) -> ObsoleteAccountsStats {
  6330. let stats: ObsoleteAccountsStats = pubkeys_with_duplicates_by_bin
  6331. .par_iter()
  6332. .map(|pubkeys_by_bin| {
  6333. let reclaims = self
  6334. .accounts_index
  6335. .clean_and_unref_rooted_entries_by_bin(pubkeys_by_bin);
  6336. let stats = PurgeStats::default();
  6337. // Mark all the entries as obsolete, and remove any empty storages
  6338. if !reclaims.is_empty() {
  6339. self.handle_reclaims(
  6340. reclaims.iter(),
  6341. None,
  6342. &HashSet::new(),
  6343. HandleReclaims::ProcessDeadSlots(&stats),
  6344. MarkAccountsObsolete::Yes(slot_marked_obsolete),
  6345. );
  6346. }
  6347. ObsoleteAccountsStats {
  6348. accounts_marked_obsolete: reclaims.len() as u64,
  6349. slots_removed: stats.total_removed_storage_entries.load(Ordering::Relaxed)
  6350. as u64,
  6351. }
  6352. })
  6353. .sum();
  6354. stats
  6355. }
  6356. /// Startup processes can consume large amounts of memory while inserting accounts into the index as fast as possible.
  6357. /// Calling this can slow down the insertion process to allow flushing to disk to keep pace.
  6358. fn maybe_throttle_index_generation(&self) {
  6359. // Only throttle if we are generating on-disk index. Throttling is not needed for in-mem index.
  6360. if !self.accounts_index.is_disk_index_enabled() {
  6361. return;
  6362. }
  6363. // This number is chosen to keep the initial ram usage sufficiently small
  6364. // The process of generating the index is governed entirely by how fast the disk index can be populated.
  6365. // 10M accounts is sufficiently small that it will never have memory usage. It seems sufficiently large that it will provide sufficient performance.
  6366. // Performance is measured by total time to generate the index.
  6367. // Just estimating - 150M accounts can easily be held in memory in the accounts index on a 256G machine. 2-300M are also likely 'fine' during startup.
  6368. // 550M was straining a 384G machine at startup.
  6369. // This is a tunable parameter that just needs to be small enough to keep the generation threads from overwhelming RAM and oom at startup.
  6370. const LIMIT: usize = 10_000_000;
  6371. while self
  6372. .accounts_index
  6373. .get_startup_remaining_items_to_flush_estimate()
  6374. > LIMIT
  6375. {
  6376. // 10 ms is long enough to allow some flushing to occur before insertion is resumed.
  6377. // callers of this are typically run in parallel, so many threads will be sleeping at different starting intervals, waiting to resume insertion.
  6378. sleep(Duration::from_millis(10));
  6379. }
  6380. }
/// Visit zero lamport pubkeys and populate zero_lamport_single_ref info on
/// storage.
/// Returns the number of zero lamport single ref accounts found.
fn visit_zero_lamport_pubkeys_during_startup(&self, mut pubkeys: Vec<Pubkey>) -> u64 {
    // slot -> offsets (within that slot's storage) of zero-lamport single-ref accounts
    let mut slot_offsets = HashMap::<_, Vec<_>>::default();
    // sort the pubkeys first so that in scan, the pubkeys are visited in
    // index bucket in order. This helps to reduce the page faults and speed
    // up the scan compared to visiting the pubkeys in random order.
    let orig_len = pubkeys.len();
    pubkeys.sort_unstable();
    pubkeys.dedup();
    let uniq_len = pubkeys.len();
    info!(
        "visit_zero_lamport_pubkeys_during_startup: {orig_len} pubkeys, {uniq_len} after dedup",
    );
    self.accounts_index.scan(
        pubkeys.iter(),
        |_pubkey, slots_refs| {
            // every pubkey passed in was just inserted during index generation,
            // so the code assumes each is present (unwrap)
            let (slot_list, ref_count) = slots_refs.unwrap();
            if ref_count == 1 {
                // a single ref implies exactly one slot-list entry
                assert_eq!(slot_list.len(), 1);
                let (slot_alive, account_info) = slot_list.first().unwrap();
                // index generation inserts stored accounts only, never cached ones
                assert!(!account_info.is_cached());
                if account_info.is_zero_lamport() {
                    slot_offsets
                        .entry(*slot_alive)
                        .or_default()
                        .push(account_info.offset());
                }
            }
            AccountsIndexScanResult::OnlyKeepInMemoryIfDirty
        },
        None,
        ScanFilter::All,
    );
    // counters reported via shrink_stats below
    let mut count = 0;
    let mut dead_stores = 0;
    let mut shrink_stores = 0;
    let mut non_shrink_stores = 0;
    for (slot, offsets) in slot_offsets {
        if let Some(store) = self.storage.get_slot_storage_entry(slot) {
            count += store.batch_insert_zero_lamport_single_ref_account_offsets(&offsets);
            if store.num_zero_lamport_single_ref_accounts() == store.count() {
                // all accounts in this storage can be dead
                self.dirty_stores.entry(slot).or_insert(store);
                dead_stores += 1;
            } else if Self::is_shrinking_productive(&store)
                && self.is_candidate_for_shrink(&store)
            {
                // this store might be eligible for shrinking now
                if self.shrink_candidate_slots.lock().unwrap().insert(slot) {
                    shrink_stores += 1;
                }
            } else {
                non_shrink_stores += 1;
            }
        }
    }
    self.shrink_stats
        .num_zero_lamport_single_ref_accounts_found
        .fetch_add(count, Ordering::Relaxed);
    self.shrink_stats
        .num_dead_slots_added_to_clean
        .fetch_add(dead_stores, Ordering::Relaxed);
    self.shrink_stats
        .num_slots_with_zero_lamport_accounts_added_to_shrink
        .fetch_add(shrink_stores, Ordering::Relaxed);
    self.shrink_stats
        .marking_zero_dead_accounts_in_non_shrinkable_store
        .fetch_add(non_shrink_stores, Ordering::Relaxed);
    count
}
/// Used during generate_index() to:
/// 1. get the _duplicate_ accounts data len from the given pubkeys
/// 2. get the slots that contained duplicate pubkeys
/// 3. build up the duplicates lt hash
///
/// Note this should only be used when ALL entries in the accounts index are roots.
///
/// returns tuple of:
/// - data len sum of all older duplicates
/// - number of duplicate accounts
/// - lt hash of duplicates
fn visit_duplicate_pubkeys_during_startup(
    &self,
    pubkeys: &[Pubkey],
) -> (u64, u64, Box<DuplicatesLtHash>) {
    let mut accounts_data_len_from_duplicates = 0;
    let mut num_duplicate_accounts = 0_u64;
    let mut duplicates_lt_hash = Box::new(DuplicatesLtHash::default());
    self.accounts_index.scan(
        pubkeys.iter(),
        |pubkey, slots_refs| {
            if let Some((slot_list, _ref_count)) = slots_refs {
                if slot_list.len() > 1 {
                    // Only the account data len in the highest slot should be used, and the rest are
                    // duplicates. So find the max slot to keep.
                    // Then sum up the remaining data len, which are the duplicates.
                    // All of the slots need to go in the 'uncleaned_slots' list. For clean to work properly,
                    // the slot where duplicate accounts are found in the index need to be in 'uncleaned_slots' list, too.
                    let max = slot_list.iter().map(|(slot, _)| slot).max().unwrap();
                    slot_list.iter().for_each(|(slot, account_info)| {
                        if slot == max {
                            // the info in 'max' is the most recent, current info for this pubkey
                            return;
                        }
                        // load the older (duplicate) version of the account from its storage
                        let maybe_storage_entry = self
                            .storage
                            .get_account_storage_entry(*slot, account_info.store_id());
                        let mut accessor = LoadedAccountAccessor::Stored(
                            maybe_storage_entry.map(|entry| (entry, account_info.offset())),
                        );
                        accessor.check_and_get_loaded_account(|loaded_account| {
                            let data_len = loaded_account.data_len();
                            // zero-lamport duplicates do not contribute to accounts data len
                            if loaded_account.lamports() > 0 {
                                accounts_data_len_from_duplicates += data_len;
                            }
                            num_duplicate_accounts += 1;
                            // mix each duplicate into the lt hash; the caller mixes this
                            // total *out* of the accumulated lt hash (see generate_index)
                            let account_lt_hash =
                                Self::lt_hash_account(&loaded_account, pubkey);
                            duplicates_lt_hash.0.mix_in(&account_lt_hash.0);
                        });
                    });
                }
            }
            AccountsIndexScanResult::OnlyKeepInMemoryIfDirty
        },
        None,
        ScanFilter::All,
    );
    (
        accounts_data_len_from_duplicates as u64,
        num_duplicate_accounts,
        duplicates_lt_hash,
    )
}
  6517. fn set_storage_count_and_alive_bytes(
  6518. &self,
  6519. stored_sizes_and_counts: StorageSizeAndCountMap,
  6520. timings: &mut GenerateIndexTimings,
  6521. ) {
  6522. // store count and size for each storage
  6523. let mut storage_size_storages_time = Measure::start("storage_size_storages");
  6524. for (_slot, store) in self.storage.iter() {
  6525. let id = store.id();
  6526. // Should be default at this point
  6527. assert_eq!(store.alive_bytes(), 0);
  6528. if let Some(entry) = stored_sizes_and_counts.get(&id) {
  6529. trace!(
  6530. "id: {} setting count: {} cur: {}",
  6531. id,
  6532. entry.count,
  6533. store.count(),
  6534. );
  6535. {
  6536. let prev_count = store.count.swap(entry.count, Ordering::Release);
  6537. assert_eq!(prev_count, 0);
  6538. }
  6539. store
  6540. .alive_bytes
  6541. .store(entry.stored_size, Ordering::Release);
  6542. } else {
  6543. trace!("id: {id} clearing count");
  6544. store.count.store(0, Ordering::Release);
  6545. }
  6546. }
  6547. storage_size_storages_time.stop();
  6548. timings.storage_size_storages_us = storage_size_storages_time.as_us();
  6549. }
/// Log a debugging snapshot of this AccountsDb: the accounts index contents
/// followed by per-slot storage counts/status, each prefixed with `label`.
pub fn print_accounts_stats(&self, label: &str) {
    self.print_index(label);
    self.print_count_and_status(label);
}
  6554. fn print_index(&self, label: &str) {
  6555. let mut alive_roots: Vec<_> = self.accounts_index.all_alive_roots();
  6556. #[allow(clippy::stable_sort_primitive)]
  6557. alive_roots.sort();
  6558. info!("{label}: accounts_index alive_roots: {alive_roots:?}");
  6559. self.accounts_index.account_maps.iter().for_each(|map| {
  6560. for pubkey in map.keys() {
  6561. self.accounts_index.get_and_then(&pubkey, |account_entry| {
  6562. if let Some(account_entry) = account_entry {
  6563. let list_r = account_entry.slot_list_read_lock();
  6564. info!(" key: {} ref_count: {}", pubkey, account_entry.ref_count(),);
  6565. info!(" slots: {list_r:?}");
  6566. }
  6567. let add_to_in_mem_cache = false;
  6568. (add_to_in_mem_cache, ())
  6569. });
  6570. }
  6571. });
  6572. }
  6573. pub fn print_count_and_status(&self, label: &str) {
  6574. let mut slots: Vec<_> = self.storage.all_slots();
  6575. #[allow(clippy::stable_sort_primitive)]
  6576. slots.sort();
  6577. info!("{}: count_and status for {} slots:", label, slots.len());
  6578. for slot in &slots {
  6579. let entry = self.storage.get_slot_storage_entry(*slot).unwrap();
  6580. info!(
  6581. " slot: {} id: {} count: {} len: {} capacity: {}",
  6582. slot,
  6583. entry.id(),
  6584. entry.count(),
  6585. entry.accounts.len(),
  6586. entry.accounts.capacity(),
  6587. );
  6588. }
  6589. }
  6590. }
/// How `handle_reclaims` should treat slots whose accounts were all reclaimed.
#[derive(Debug, Copy, Clone)]
enum HandleReclaims<'a> {
    /// Process dead slots, accumulating purge statistics into the referenced stats.
    ProcessDeadSlots(&'a PurgeStats),
}
/// Specify whether obsolete accounts should be marked or not during reclaims
/// They should only be marked if they are also getting unreffed in the index
/// NOTE(review): an earlier comment mentioned temporarily allowing dead code
/// until the feature is implemented, but no `#[allow(dead_code)]` is present
/// here — confirm whether that note is stale.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum MarkAccountsObsolete {
    /// Mark reclaimed accounts obsolete as of the given slot.
    Yes(Slot),
    /// Do not mark reclaimed accounts obsolete.
    No,
}
/// Threading strategy used when applying updates to the accounts index.
pub enum UpdateIndexThreadSelection {
    /// Use current thread only
    Inline,
    /// Use a thread-pool if the number of updates exceeds a threshold
    PoolWithThreshold,
}
  6609. // These functions/fields are only usable from a dev context (i.e. tests and benches)
  6610. #[cfg(feature = "dev-context-only-utils")]
  6611. impl AccountStorageEntry {
  6612. fn accounts_count(&self) -> usize {
  6613. let mut count = 0;
  6614. self.accounts
  6615. .scan_pubkeys(|_| {
  6616. count += 1;
  6617. })
  6618. .expect("must scan accounts storage");
  6619. count
  6620. }
  6621. }
#[cfg(test)]
impl AccountStorageEntry {
    // Function to modify the list in the account storage entry directly. Only intended for use in testing
    /// Returns the lock guarding this storage's obsolete-accounts list so tests
    /// can inspect or mutate it directly.
    pub(crate) fn obsolete_accounts(&self) -> &RwLock<ObsoleteAccounts> {
        &self.obsolete_accounts
    }
}
// These functions/fields are only usable from a dev context (i.e. tests and benches)
#[cfg(feature = "dev-context-only-utils")]
impl AccountsDb {
    /// Default AccountsDb for tests; forwards to `new_single_for_tests`.
    pub fn default_for_tests() -> Self {
        Self::new_single_for_tests()
    }
    /// AccountsDb with no explicit paths and the standard test config.
    pub fn new_single_for_tests() -> Self {
        AccountsDb::new_for_tests(Vec::new())
    }
    /// AccountsDb with no explicit paths but a caller-chosen file provider and config.
    pub fn new_single_for_tests_with_provider_and_config(
        file_provider: AccountsFileProvider,
        accounts_db_config: AccountsDbConfig,
    ) -> Self {
        AccountsDb::new_for_tests_with_provider_and_config(
            Vec::new(),
            file_provider,
            accounts_db_config,
        )
    }
    /// AccountsDb at the given paths with the default provider and the test config.
    pub fn new_for_tests(paths: Vec<PathBuf>) -> Self {
        Self::new_for_tests_with_provider_and_config(
            paths,
            AccountsFileProvider::default(),
            ACCOUNTS_DB_CONFIG_FOR_TESTING,
        )
    }
    /// Common constructor backing all the `new_*_for_tests` variants; builds the db
    /// then overrides its accounts-file provider.
    fn new_for_tests_with_provider_and_config(
        paths: Vec<PathBuf>,
        accounts_file_provider: AccountsFileProvider,
        accounts_db_config: AccountsDbConfig,
    ) -> Self {
        let mut db = AccountsDb::new_with_config(paths, accounts_db_config, None, Arc::default());
        db.accounts_file_provider = accounts_file_provider;
        db
    }
    /// Return the number of slots marked with uncleaned pubkeys.
    /// This is useful for testing clean algorithms.
    pub fn get_len_of_slots_with_uncleaned_pubkeys(&self) -> usize {
        self.uncleaned_pubkeys.len()
    }
    /// The configured storage access mode (test-only accessor).
    #[cfg(test)]
    pub fn storage_access(&self) -> StorageAccess {
        self.storage_access
    }
    /// Call clean_accounts() with the common parameters that tests/benches use.
    pub fn clean_accounts_for_tests(&self) {
        self.clean_accounts(None, false, &EpochSchedule::default())
    }
    /// Flush the write cache for a single slot (test-only wrapper over `flush_slot_cache`).
    pub fn flush_accounts_cache_slot_for_tests(&self, slot: Slot) {
        self.flush_slot_cache(slot);
    }
    /// useful to adapt tests written prior to introduction of the write cache
    /// to use the write cache
    pub fn add_root_and_flush_write_cache(&self, slot: Slot) {
        self.add_root(slot);
        self.flush_root_write_cache(slot);
    }
    /// Load `pubkey` via `do_load` with no fixed root and an unspecified load hint.
    pub fn load_without_fixed_root(
        &self,
        ancestors: &Ancestors,
        pubkey: &Pubkey,
    ) -> Option<(AccountSharedData, Slot)> {
        self.do_load(
            ancestors,
            pubkey,
            None,
            LoadHint::Unspecified,
            // callers of this expect zero lamport accounts that exist in the index to be returned as Some(empty)
            LoadZeroLamports::SomeWithZeroLamportAccountForTests,
        )
    }
    /// Assert that `pubkey` loads at `slot` with exactly `expected_lamports`.
    pub fn assert_load_account(&self, slot: Slot, pubkey: Pubkey, expected_lamports: u64) {
        let ancestors = vec![(slot, 0)].into_iter().collect();
        // NOTE(review): `slot` is shadowed by the loaded slot, so the slot halves of
        // this assert compare the same value and are trivially equal — only the
        // lamports comparison is meaningful. Confirm whether the parameter slot
        // was intended to be checked against the loaded slot.
        let (account, slot) = self.load_without_fixed_root(&ancestors, &pubkey).unwrap();
        assert_eq!((account.lamports(), slot), (expected_lamports, slot));
    }
    /// Assert that `pubkey` does NOT load at `slot`.
    pub fn assert_not_load_account(&self, slot: Slot, pubkey: Pubkey) {
        let ancestors = vec![(slot, 0)].into_iter().collect();
        let load = self.load_without_fixed_root(&ancestors, &pubkey);
        assert!(load.is_none(), "{load:?}");
    }
    /// Spot-check `num` randomly chosen pubkeys, asserting each loads as an account
    /// with `(idx + count)` lamports at `slot` (the layout `create_account`/
    /// `modify_accounts` produce).
    pub fn check_accounts(&self, pubkeys: &[Pubkey], slot: Slot, num: usize, count: usize) {
        let ancestors = vec![(slot, 0)].into_iter().collect();
        for _ in 0..num {
            let idx = thread_rng().gen_range(0..num);
            let account = self.load_without_fixed_root(&ancestors, &pubkeys[idx]);
            let account1 = Some((
                AccountSharedData::new(
                    (idx + count) as u64,
                    0,
                    AccountSharedData::default().owner(),
                ),
                slot,
            ));
            assert_eq!(account, account1);
        }
    }
    /// Iterate over all accounts from all `storages` and call `callback` with each account.
    ///
    /// `callback` parameters:
    /// * Offset: the offset within the file of this account
    /// * StoredAccountInfo: the account itself, with account data
    pub fn scan_accounts_from_storages(
        storages: &[Arc<AccountStorageEntry>],
        mut callback: impl for<'local> FnMut(Offset, StoredAccountInfo<'local>),
    ) {
        let mut reader = append_vec::new_scan_accounts_reader();
        for storage in storages {
            storage
                .accounts
                .scan_accounts(&mut reader, &mut callback)
                .expect("must scan accounts storage");
        }
    }
    /// callers used to call store_uncached. But, this is not allowed anymore.
    pub fn store_for_tests<'a>(&self, accounts: impl StorableAccounts<'a>) {
        self.store_accounts_unfrozen(
            accounts,
            None,
            UpdateIndexThreadSelection::PoolWithThreshold,
        );
    }
    /// Overwrite the first `num` pubkeys at `slot` with zero-data accounts whose
    /// lamports are `(idx + count)`; pairs with `check_accounts`.
    #[allow(clippy::needless_range_loop)]
    pub fn modify_accounts(&self, pubkeys: &[Pubkey], slot: Slot, num: usize, count: usize) {
        for idx in 0..num {
            let account = AccountSharedData::new(
                (idx + count) as u64,
                0,
                AccountSharedData::default().owner(),
            );
            self.store_for_tests((slot, [(&pubkeys[idx], &account)].as_slice()));
        }
    }
    /// Assert that `slot`'s storage has `alive_count` alive accounts and
    /// `total_count` total account records.
    pub fn check_storage(&self, slot: Slot, alive_count: usize, total_count: usize) {
        let store = self.storage.get_slot_storage_entry(slot).unwrap();
        assert_eq!(store.count(), alive_count);
        assert_eq!(store.accounts_count(), total_count);
    }
    /// Create and store `num` system-owned accounts plus `num_vote` vote-program-owned
    /// accounts at `slot`, appending each new pubkey to `pubkeys`. Each account's
    /// lamports encode its creation order, and each pubkey is asserted absent before
    /// being stored.
    pub fn create_account(
        &self,
        pubkeys: &mut Vec<Pubkey>,
        slot: Slot,
        num: usize,
        space: usize,
        num_vote: usize,
    ) {
        let ancestors = vec![(slot, 0)].into_iter().collect();
        for t in 0..num {
            let pubkey = solana_pubkey::new_rand();
            let account =
                AccountSharedData::new((t + 1) as u64, space, AccountSharedData::default().owner());
            pubkeys.push(pubkey);
            assert!(self.load_without_fixed_root(&ancestors, &pubkey).is_none());
            self.store_for_tests((slot, [(&pubkey, &account)].as_slice()));
        }
        for t in 0..num_vote {
            let pubkey = solana_pubkey::new_rand();
            let account =
                AccountSharedData::new((num + t + 1) as u64, space, &solana_vote_program::id());
            pubkeys.push(pubkey);
            let ancestors = vec![(slot, 0)].into_iter().collect();
            assert!(self.load_without_fixed_root(&ancestors, &pubkey).is_none());
            self.store_for_tests((slot, [(&pubkey, &account)].as_slice()));
        }
    }
    // With obsolete accounts marked, obsolete references are marked in the storage
    // and no longer need to be referenced. This leads to a static reference count
    // of 1. As referencing checking is common in tests, this test wrapper abstracts the behavior
    pub fn assert_ref_count(&self, pubkey: &Pubkey, expected_ref_count: RefCount) {
        let expected_ref_count = match self.mark_obsolete_accounts {
            MarkObsoleteAccounts::Disabled => expected_ref_count,
            // When obsolete accounts are marked, the ref count is always 1 or 0
            MarkObsoleteAccounts::Enabled => expected_ref_count.min(1),
        };
        assert_eq!(
            expected_ref_count,
            self.accounts_index.ref_count_from_storage(pubkey)
        );
    }
    /// Number of alive accounts visible at `slot`: storage alive count plus any
    /// accounts still sitting in the slot's write cache.
    pub fn alive_account_count_in_slot(&self, slot: Slot) -> usize {
        self.storage
            .get_slot_storage_entry(slot)
            .map(|storage| storage.count())
            .unwrap_or(0)
            .saturating_add(
                self.accounts_cache
                    .slot_cache(slot)
                    .map(|slot_cache| slot_cache.len())
                    .unwrap_or_default(),
            )
    }
    /// useful to adapt tests written prior to introduction of the write cache
    /// to use the write cache
    pub fn flush_root_write_cache(&self, root: Slot) {
        // the caller must have already rooted this slot
        assert!(
            self.accounts_index
                .roots_tracker
                .read()
                .unwrap()
                .alive_roots
                .contains(&root),
            "slot: {root}"
        );
        self.flush_accounts_cache(true, Some(root));
    }
    /// Total account records (alive and dead) in `slot`'s accounts file,
    /// or 0 if the slot has no storage.
    pub fn all_account_count_in_accounts_file(&self, slot: Slot) -> usize {
        let store = self.storage.get_slot_storage_entry(slot);
        if let Some(store) = store {
            store.accounts_count()
        } else {
            0
        }
    }
    /// Direct access to the uncleaned-pubkeys map (test-only accessor).
    pub fn uncleaned_pubkeys(&self) -> &DashMap<Slot, Vec<Pubkey>, BuildNoHashHasher<Slot>> {
        &self.uncleaned_pubkeys
    }
}