Compare commits
85f56e55ae ... 8d8a1e0634

3 commits:
- 8d8a1e0634
- c95b02bd7e
- 6fa03619ee
@@ -9,8 +9,8 @@ import (
 	log "github.com/sirupsen/logrus"
 
-	"git.nicholasnovak.io/nnovak/spatial-db/server"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/server"
+	"github.com/NickyBoy89/spatial-db/world"
 )
 
 func populateStorageDir(
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"testing"
 
-	"git.nicholasnovak.io/nnovak/spatial-db/server"
+	"github.com/NickyBoy89/spatial-db/server"
 )
 
 var hash server.HashServer
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"testing"
 
-	"git.nicholasnovak.io/nnovak/spatial-db/server"
+	"github.com/NickyBoy89/spatial-db/server"
 )
 
 var inmemory server.InMemoryServer
@@ -5,7 +5,7 @@ import (
 	"strconv"
 	"strings"
 
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/world"
 	"github.com/Tnze/go-mc/save"
 	"github.com/Tnze/go-mc/save/region"
 )
@@ -5,7 +5,7 @@ import (
 	"path/filepath"
 	"strings"
 
-	"git.nicholasnovak.io/nnovak/spatial-db/storage"
+	"github.com/NickyBoy89/spatial-db/storage"
 	log "github.com/sirupsen/logrus"
 
 	"github.com/spf13/cobra"
main.go (6 lines changed)
@@ -1,9 +1,9 @@
 package main
 
 import (
-	"git.nicholasnovak.io/nnovak/spatial-db/connector"
-	"git.nicholasnovak.io/nnovak/spatial-db/loading"
-	"git.nicholasnovak.io/nnovak/spatial-db/visualization"
+	"github.com/NickyBoy89/spatial-db/connector"
+	"github.com/NickyBoy89/spatial-db/loading"
+	"github.com/NickyBoy89/spatial-db/visualization"
 	"github.com/spf13/cobra"
 )
 
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"testing"
 
-	"git.nicholasnovak.io/nnovak/spatial-db/server"
+	"github.com/NickyBoy89/spatial-db/server"
 )
 
 var disk server.SimpleServer
paper/block-search.drawio.png (new binary file, 22 KiB; not shown)
paper/document.tex (new file, 547 lines)
@@ -0,0 +1,547 @@
\documentclass[10pt,twocolumn]{article}

\usepackage{oxycomps}
\bibliography{references}

\pdfinfo{
    /Title (SpatialDB: A Database for Storing Dense Three-Dimensional Voxel Structures)
    /Author (Nicholas Novak)
}

\title{SpatialDB: A Database for Storing Dense Three-Dimensional Voxel Structures}
\author{Nicholas Novak}
\affiliation{Occidental College}
\email{nnovak@oxy.edu}

\begin{document}

\maketitle
\section{Introduction and Problem Context}

% What my project is
For my senior comprehensive project, I designed and implemented a database
application built specifically to store complex shapes as ``voxels'', or
three-dimensional pixels.
% Applications of voxels
A voxel\cite{enwiki:1186283262} represents a single point or cube in a
three-dimensional grid, at a variable size. This allows voxels to
approximately model many three-dimensional structures while reducing the
computational complexity of analyzing the shape, which has led to many
data-related use cases outside of computer science. For example, to model the
inner workings of the brain, neuroscientists track oxygen concentration through
neural tissue on a voxel grid as part of fMRI studies\cite{norman2006beyond},
and movie studios such as DreamWorks use voxel data structures to model light
reflections for visual effects\cite{museth2013vdb}. The output of an MRI scan
in a hospital is a very high-resolution voxel grid. Most recently, machine
learning models are being trained on the LIDAR data from self-driving
cars\cite{li2020deep} in order to better process their environments. However,
voxels are not often thought of as a way to permanently store
three-dimensional shapes, and existing research focuses mainly on efficiently
representing and processing shapes. My approach takes this problem of voxel
storage and representation and turns it into a problem of database design.
\subsection{Using Minecraft as a Model for a Database}

% The problems with Minecraft
Minecraft\footnote{https://www.minecraft.net/en-us}, released in 2009, is a
sandbox game played in a world composed entirely of cubic voxels, where the
player has complete freedom to manipulate the world by building, destroying, or
exploring any part of it. I am focusing this database on the requirements of
Minecraft because the game poses some additional challenges that traditional
databases do not consider. Primarily, the world of Minecraft is infinite in the
horizontal $x$ and $z$ axes, but fixed in the $y$ axis, which limits the amount
of information that the database can store at once. The world also contains a
denser voxel grid than many other applications, meaning that far more of the
blocks in the world are filled than empty.
A game is also a real-time application, which means that any performance issues
will be immediately apparent to the user. Most databases can be evaluated on
speed alone, but because the Minecraft server processes new information 20
times per second, the game has a time budget of 50ms to handle all game logic,
including the storing of data. Less time spent processing the data in the world
means more time freed up for the game to do other work, although finishing work
earlier is not necessarily faster for the end user if the work is already under
the 50ms budget. Most databases do not meet this requirement: even though they
may be faster on average, their complexity means they cannot guarantee that
every operation finishes within this time limit.
These limitations also make Minecraft unable to take advantage of a cache.
Since the number of different operations that can be performed on the world is
infinitely large, remembering previous operations will often not help the
system's performance. Minecraft also provides a good benchmark for the
database, because the unpredictability of players stresses the system's
ability to return results in a variety of settings.
\section{Technical Background}

\subsection{What is a database?}
When I refer to the concept of a database, I mean a program that sits more or
less as a ``black box'' between the user and a software application, storing
any data that the application requires. In most existing applications, this
role is filled by a category of databases called ``relational databases'',
which offer a very general-purpose way to store highly connected user data.
For instance, a person stored in a relational database would be efficiently
linked to any of their associated information, such as name or age.
% The model of a database
In database terms, any data added to the database is called a ``write'', data
retrieved from the database is called a ``read'', and any questions asked, such
as ``how many people have done this'', are called ``queries''. Developers ask
these questions through computer languages, one example being Structured Query
Language (SQL), which allows the database to be queried efficiently.
\subsection{Challenges With Existing Databases}

% Software development and SQL
Most software engineering projects start with a simple front-end and back-end,
typically implemented with some sort of Model-View-Controller architecture and
connected to a relational SQL database \cite{sqliteOnlyDatabase}. This idea was
popularized by frameworks such as Ruby on Rails and Django, where the model was
most often represented by structures within the database. This approach allowed
software developers to focus on writing business logic without having to worry
about the inner workings of the database. Many start-ups were built this way,
including GitHub \cite{githubSingleSQL}, which recently moved off its single
SQL database after 13 years, citing performance issues.
% Challenges with working with SQL: Performance
Using a single SQL-speaking database can be a significant advantage for
development speed, but the database can have trouble keeping up with the
demands of the application as its performance requirements expand.
% Caching
As soon as this happens, companies typically put smaller caching applications
in front of their database, such as \verb|Redis|\footnote{https://redis.io/},
\verb|memcached|\cite{nishtala2013scaling}, or \verb|TAO| \cite{bronson2013tao},
which allow the application to remember some of the commonly asked questions
and reduce load on the database by not repeating the same work.
\subsubsection{The Complexity of General-Purpose Databases}
% What is being done about this
Modern SQL databases are also very complex. Three of the most popular SQL
databases, PostgreSQL, MySQL, and SQLite, have 1.4 million lines
\footnote{https://wiki.postgresql.org/wiki/GSoC\_2018, in reference to the
text ``PostgreSQL is over 1.3M lines of code and some of the code paths can be
tricky to reach.''} of code, 2.9 million lines
\footnote{https://www.openhub.net/p/mysql}, and 150,000 lines
\footnote{https://www.sqlite.org/testing.html}, respectively.
% Why are databases inefficient?
Why are databases so complex? Most of the complexity stems from the fact that
because these database systems are so general-purpose, they cannot assume
anything about the data stored in them. For the database, finding an efficient
plan to answer each query is a known NP-hard
problem\cite{chatterji2002complexity}, and to stay fast, the database must
construct this plan with a complex set of approximations based on whatever
assumptions it can make, which leads to ever-growing complexity.
% Impossible to maintain
With this complexity, it is impossible for a single person to understand the
complete inner workings of a database. Thus, the company's database often
becomes the responsibility of a dedicated person at companies that can afford
it, or of entire teams of engineers at larger organizations such as
Google\cite{googlePerfTeam}.
% Intro to special-purpose databases
What happens at the larger companies that can afford more engineering time and
have a specific problem that they cannot solve with a traditional database?
Typically, this leads to the creation of special-purpose database solutions.
For instance, the global scale of iCloud and Apple's cloud services required
them to create FoundationDB\cite{zhou2021foundationdb}. A different set of
challenges in the Facebook inbox led to the creation of Apache
Cassandra\cite{lakshman2010cassandra}, which is optimized to allow many
messages to be received, at the expense of searches, which happen far less
frequently.
\subsubsection{The Special-Purpose Database}

Limiting a database's design to a specific use-case can make the development
process much simpler, to the point where it can be done by a single person, and
can offer higher performance. The first question that needs to be asked is
whether the application is \textit{write-heavy} or \textit{read-heavy}.
Read-heavy applications occur often in web development; most social media
platforms have far more users reading content than writing new content for the
platform. In contrast, write-heavy applications are often seen in analytics
workloads, where data is written from many sources and analyzed infrequently
by users.

My application has a relatively even balance of writes and reads, and I
evaluated three different storage data structures before choosing to implement
my own.
% Special-purpose databases
Recently, companies such as Tigerbeetle\cite{tigerbeetleDesign} have taken this
domain-driven approach to database design even further, designing a database
from the ground up for financial accounting. It outperforms a reference MySQL
implementation, processing 1,757 accounting transactions per second to MySQL's
76 \cite{tigerbeetlePerf}. This highly specialized and domain-specific approach
to creating databases is what my project is based on: creating a database
around the challenges posed by the game Minecraft.
\subsubsection{Key-Value Databases}

One of the main architectures that I considered for my project is a design
called a key-value store\cite{kvdatabase}, which would store the relationship
of a single voxel to its value. Many other voxel databases use this method to
achieve constant-time operations on retrieving points, meaning that regardless
of the size of the dataset, the database will always be able to return a
result in the same amount of time. This structure is behind many of the
high-performance caches that are commonly used to speed up web applications,
such as Redis and RocksDB\cite{dong2021rocksdb}. In order to provide high
speeds for this data, the key-value mappings are usually stored in main memory,
which is far more expensive and limited than the system's disk drive, but
offers a speed advantage of several orders of magnitude\cite{latencyKnow}.
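As an illustration, the essence of such a key-value voxel store fits in a few
lines of Go. This is a minimal sketch, and the names are hypothetical rather
than taken from any production system:

\begin{lstlisting}
// A minimal sketch of a key-value voxel store,
// keyed directly on the voxel's integer position.
type Position struct{ X, Y, Z int }

type KeyValueStore struct {
	blocks map[Position]uint32 // position -> encoded block state
}

func NewKeyValueStore() *KeyValueStore {
	return &KeyValueStore{blocks: make(map[Position]uint32)}
}

// Reads and writes both take constant time on average,
// regardless of how many voxels the world contains.
func (s *KeyValueStore) Write(p Position, state uint32) {
	s.blocks[p] = state
}

func (s *KeyValueStore) Read(p Position) uint32 {
	return s.blocks[p] // the zero value marks an empty voxel
}
\end{lstlisting}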
\section{Prior Work}

\subsection{Voxels for Efficient Computation}

Most existing literature on using voxels to store shapes focuses on applying
the voxel grid to efficient computation. Since voxel points are completely
independent of each other, efficient parallel processors, which are
increasingly common in consumer hardware, can take advantage of this
parallelism. In VDB\cite{museth2013vdb}, Museth demonstrates that by modeling
a sparse voxel grid at different resolutions, a computer cluster can
efficiently approximate a physical structure such as a cloud, in order to
calculate expensive lighting operations.
% Parallel processing on voxels
Williams\cite{williams1992voxel} expands the uses of a voxel database to model
graph- and mesh-based problems. By taking advantage of the parallelism in the
grid, many problems can be reframed in terms of voxels and solved far more
efficiently. This model, however, assumes that every voxel is stored in shared
memory, making the approach viable only for problems that can be modeled on
one machine and that are computationally expensive rather than data-intensive.
\subsection{Storing Large Voxel Data Sets}

Another approach to the problem of storing voxel data is the distributed
approach of Gorte et al. \cite{gorte2023analysis}. Since memory is limited
within one computer, the workload can be split up between many servers, which
allows very large datasets to be worked on from a single workstation through
an API. This method keeps many of the same performance considerations, but
also assumes that the voxel data is not very dense, and uses a
three-dimensional data structure called an octree, which allows the user to
change the resolution of the data that they are working on. In the paper,
Gorte acknowledges the need to split large datasets into smaller regions,
which is similar to the concept of ``chunks'' in my implementation.
\subsection{Chunk Systems in Other Games}

The decision to use chunks to represent game data has many justifications. As
\cite{gorte2023analysis} mentions, an infinite grid of voxels needs to be
broken up in a way that lets applications store data efficiently, and many
other games converge on this same implementation. Another voxel-based game,
Veloren\footnote{https://veloren.net}, uses the same chunk-based system,
although it differs in its storage method. The game switches between several
different storage implementations in each chunk, depending on how dense or
sparse the voxel data within the chunk is. For sparser data, the game stores
block information in a simple key-value hash map. As the number of voxels
increases, the game breaks this information up further, creating several
smaller sections within the chunk. Finally, for very dense data, the game
stores a compressed version using Zlib compression\cite{veloren32}. This
offers many options for data compression in my database, but also shows how
the database could be adapted to store sparser structures more efficiently if
the focus of the project ever needed to change. Since this game is based not
on Minecraft but on an independent project named Cube World, the fact that it
arrives at a similar data structure confirms the performance considerations
for using such a structure. The benchmarks that they show suggest about an
order-of-magnitude improvement over using a key-value store.
\subsection{Previous Special-Purpose Databases}

The design of my database was also inspired by the LSM-tree-based, data-driven
design of Tigerbeetle\cite{tigerbeetleDesign}, which is also able to handle
concurrent operations on the same data. Another database,
CockroachDB\footnote{https://www.cockroachlabs.com/product/}, uses a key-value
mapping backend to store SQL-like tables and rows. Finally, the design of
caching layers in modern SQL caches such as Noria\cite{gjengset2018noria}
shows that it is possible to efficiently remember the complex queries found in
SQL and replicate these in real time.
\section{Methods}

\subsection{The Interface for the Database}

For developers to interact with the database, it is implemented as a library
that provides a simple application programming interface to read and write
data, consisting of the following operations (sketched in code after the
list). The performance considerations for each of these operations can be
found in the evaluation sections below.
\begin{itemize}
	\item Read a single block
	\item Write a single block
	\item Change a range of blocks
	\item Read a pre-defined ``chunk'' of blocks
\end{itemize}
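In Go, the library's surface can be sketched as the following interface; the
type and method names here are illustrative, not the exact identifiers used in
my implementation:

\begin{lstlisting}
// Illustrative types for positions and data.
type BlockPos struct{ X, Y, Z int }
type ChunkPos struct{ X, Z int }
type BlockState uint32
type Chunk struct{ Data []byte } // stand-in for the real chunk type

// A sketch of the database's public API.
type VoxelStore interface {
	// Read and write a single block.
	ReadBlockAt(pos BlockPos) (BlockState, error)
	WriteBlockAt(pos BlockPos, state BlockState) error
	// Set every voxel in a region to one state.
	FillRegion(start, end BlockPos, state BlockState) error
	// Read a pre-defined chunk of blocks.
	ReadChunkAt(pos ChunkPos) (Chunk, error)
}
\end{lstlisting}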
\subsection{Reading and Writing a Single Voxel}

The process of updating the data for a single point in the world starts with
the voxel's position. Because the world is infinite on the horizontal $x$ and
$z$ axes, storage is organized into a system of ``chunks'', which are
fixed-size 16x16 columns of voxels, 256 voxels high. The size of these chunks
is chosen so that they are large enough to be efficiently cached, and so that
many operations can occur within the same chunk, but not so large that the
hundred or so chunks sent to the user upon joining the world cause a network
slowdown. Given a point's $x$ and $z$ positions, the chunk that the voxel
belongs to can be found with a fast modulus operation, in constant time.
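Concretely, if the chunk width is kept at a power of two, the division and the
modulus reduce to a shift and a mask. A minimal sketch, assuming that layout:

\begin{lstlisting}
const chunkSize = 16 // voxels per chunk side

// chunkCoord maps a world coordinate to its chunk
// coordinate; an arithmetic shift floor-divides by 16
// and is correct for negative coordinates as well.
func chunkCoord(v int) int {
	return v >> 4
}

// inChunkCoord is the remainder of the same division:
// the voxel's offset inside its chunk.
func inChunkCoord(v int) int {
	return v & (chunkSize - 1)
}
\end{lstlisting}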
To fetch the data for that chunk, the database needs to read it from disk. The
database stores this information in combined files that I call ``unity files''
(shown in figure \ref{fig:unity}), which consist of a single file on disk in
which the encoded data for each chunk is stored as a start index and size, so
that the \verb|seek| syscall can be used to efficiently query this data while
keeping only one file open. This scheme replaced the previous system of
storing chunk files separately, because the filesystem had a hard time
searching through the hundreds of thousands of chunk files in larger worlds.
The start position and size are stored in an auxiliary hash map that maps
every chunk's position to its metadata within the unity file. This structure
uses a minimal amount of memory, and also allows any chunk to be fetched from
disk in a constant amount of time and disk reads.
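A sketch of reading one chunk out of a unity file follows; the metadata type
names are hypothetical, the \verb|ChunkPos| type is reused from the interface
sketch above, and the standard \verb|os| and \verb|fmt| packages are assumed
to be imported:

\begin{lstlisting}
// chunkLocation records where one chunk's encoded
// data lives inside the unity file.
type chunkLocation struct {
	Start int64 // byte offset in the file
	Size  int64 // length of the encoded chunk
}

type UnityFile struct {
	f     *os.File
	index map[ChunkPos]chunkLocation // the auxiliary hash map
}

// ReadChunkData fetches one chunk with a single
// positioned read, keeping only one file open.
func (u *UnityFile) ReadChunkData(pos ChunkPos) ([]byte, error) {
	loc, ok := u.index[pos]
	if !ok {
		return nil, fmt.Errorf("chunk %v not in file", pos)
	}
	buf := make([]byte, loc.Size)
	if _, err := u.f.ReadAt(buf, loc.Start); err != nil {
		return nil, err
	}
	return buf, nil
}
\end{lstlisting}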
\begin{figure}
	\centering
	\includegraphics[width=8cm]{unity-file.drawio.png}
	\caption{The layout of a unity file}
	\label{fig:unity}
\end{figure}
Each chunk is further divided into sections: each chunk consists of 16 stacked
16x16x16 cubes of voxels, for a total of 4096 block states per section. Using
the voxel's $y$ position, the section for a block can be found with another
modulus. Once the section is found, a perfect hash function maps the voxel's
position to an array index within the section. Both of these steps are done in
constant time.
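Because every section is a full 16x16x16 cube, one such perfect hash is simply
the positional encoding of the in-section coordinates. A sketch, assuming this
layout:

\begin{lstlisting}
// Each chunk stacks 16 sections; each section holds
// 16*16*16 = 4096 block states.
func sectionIndex(y int) int {
	return y >> 4
}

// blockIndex is a perfect hash from in-section
// coordinates to a unique slot in [0, 4096).
func blockIndex(x, y, z int) int {
	return (y&15)<<8 | (z&15)<<4 | x&15
}
\end{lstlisting}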
Every section additionally stores a look-up table that maps a \textit{palette
index} to the state of a block. When the value for a point is retrieved from
the section, the value returned is not the block's state, but simply an index
into this palette. The palette lookup is done in constant time, and when a
block that needs an additional state is added to the section, that state is
appended to the palette in constant time as well. The existence of this
palette is what supports the efficient operation of changing large portions of
blocks in the world.
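A sketch of a section and its palette, continuing with the same assumed layout
and types:

\begin{lstlisting}
type Section struct {
	palette []BlockState // palette index -> block state
	blocks  [4096]uint16 // per-voxel palette indices
}

// BlockAt resolves a voxel through the palette.
func (s *Section) BlockAt(x, y, z int) BlockState {
	return s.palette[s.blocks[blockIndex(x, y, z)]]
}

// SetBlock reuses a matching palette entry or appends
// a new one; palettes stay small in practice, so this
// scan is effectively constant-time.
func (s *Section) SetBlock(x, y, z int, b BlockState) {
	idx := -1
	for i, p := range s.palette {
		if p == b {
			idx = i
			break
		}
	}
	if idx < 0 {
		idx = len(s.palette)
		s.palette = append(s.palette, b)
	}
	s.blocks[blockIndex(x, y, z)] = uint16(idx)
}
\end{lstlisting}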
Once the value of the point is found in the palette, the value can be returned
to the user. A visual diagram of this process can be found in figure
\ref{fig:lookup}.
\begin{figure}
	\centering
	\includegraphics[width=8cm]{block-search.drawio.png}
	\caption{The process of looking up a single block}
	\label{fig:lookup}
\end{figure}
The ability to change a region of blocks is also a common operation within the
database, and one that is not locked to a specific range of chunks. This
operation is implemented by overwriting the palettes for the region: by
pointing every palette index at the same value, every voxel in the chunk is
effectively set to that value. This does, however, create the need for an
additional ``compaction'' step, in which the palette is shrunk to remove
duplicate values, and every block within the section is updated to point to
the correct index in the compacted palette. This compaction is performed upon
the next write of a block to the section; because only that fixed-size section
needs to be changed, the operation remains constant-time.
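A sketch of that compaction step, over the same assumed section layout:

\begin{lstlisting}
// compact rebuilds the palette without duplicates and
// remaps every voxel's palette index accordingly.
func (s *Section) compact() {
	newPalette := make([]BlockState, 0, len(s.palette))
	seen := make(map[BlockState]uint16)
	remap := make([]uint16, len(s.palette)) // old -> new index
	for oldIdx, state := range s.palette {
		newIdx, ok := seen[state]
		if !ok {
			newIdx = uint16(len(newPalette))
			newPalette = append(newPalette, state)
			seen[state] = newIdx
		}
		remap[oldIdx] = newIdx
	}
	for i, old := range s.blocks {
		s.blocks[i] = remap[old]
	}
	s.palette = newPalette
}
\end{lstlisting}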
Finally, the retrieval of a single chunk can be done efficiently, because the
database already stores chunks separately, and serializes these to the client.

% \cite{vohra2016apache}.
\section{Evaluation Metrics}

\subsection{Reading Single Voxels}

Reads and writes of single voxels are the most common fundamental operation
for my database, and the database should handle this operation in the same
amount of time regardless of the size of the world. Both my implementation and
the simpler key-value store meet this criterion.
\subsection{Changing Regions of Voxels}

Changing regions of voxels should be possible in linear time, because
resetting or changing a region of voxels is important while drawing shapes of
various resolutions. Lower-resolution shapes are less precise, and can thus be
written faster.
\subsection{Memory Requirements}

The memory requirement is set quite low, at 256MB, in order to require the
database to store most of its data on disk and to limit its memory usage to
important caching features. This limit was chosen with larger datasets in
mind: for datasets that don't fit within the memory of a single machine,
memory is much more expensive than disk storage, and an in-memory requirement
would limit the analysis to smaller voxel grids.
\subsection{Reading Regions of Voxels}

The ability to retrieve large shapes from the database is important: to export
a shape, an operation must exist to do so efficiently. This operation must
therefore be done in constant time because, as Gorte\cite{gorte2023analysis}
identifies, many researchers might want to work on the same dataset, and
exporting all of this data would otherwise be inefficient for the database to
process. In the use-case of Minecraft, this allows the server to support many
more players at once, by not sending every individual block to each client.
This requirement is not met by the key-value database, but is met by my
implementation, which sends the chunks as they are stored on disk.
\subsection{Reading Neighboring Blocks}

The last common operation in most voxel databases is the ability to read
points that neighbor another point. This is important because many voxel
shapes approximate cubic shapes \cite{gorte2023analysis}, and in Minecraft,
players constantly affect voxels that are near each other.
\section{Results and Discussion}

Benchmarking on my laptop, inserting values at various spreads around the
voxel world, I get the benchmarks in figure \ref{fig:reads}, comparing an
in-memory implementation of SpatialDB, the disk-based implementation of
SpatialDB, and a memory-based key-value implementation:
\begin{figure}
	\centering
	\begin{tabular}{c | c | c | c}
		Spread of Points & In-memory & Disk & KeyValue\\
		\hline
		128 & 4275 & 4146669 & 176.7\\
		512 & 4184 & 3319162 & 190.6\\
		2048 & 2613 & 422938 & 184.8\\
		65536 & 2382 & 18814 & 186.1
	\end{tabular}
	\caption{Time (in ns) to operate on a single voxel, based on the size of
	the world (spread)}
	\label{fig:reads}
\end{figure}
These results show that the scaling remains consistent between the in-memory
version and the key-value store, although my implementation is one to two
orders of magnitude slower than the latter. This scaling is not matched by the
performance of the on-disk database. Originally, I thought these poor results
were caused by the lack of caching on the chunk files, which would have made
searches much slower, but that still doesn't explain the improvement in
performance for larger worlds. This led me first to implement a disk cache,
which had similar results, and then to the final implementation, in which I
combined all the data into one large file and selectively read sections from
that file. This leads me to believe that as the points tested grow more spread
out, since the world is only so large, many points fall outside of the loaded
chunks and instantly return empty.
This could likely be addressed by a change in caching methods that remembers
the data for more chunks, but that still doesn't address the slow speed of
accessing the data in the first place. The slow speeds most likely come from
decoding the JSON data stored on disk, which is relatively large, at about 4
megabytes. A custom encoding method could be designed to replace this scheme,
or the entire storage space for the chunks could be pre-allocated, so that
chunk data could be retrieved without decoding an entire chunk. However, this
would require a much more constrained data layout, and would limit the
implementation of different voxels.

Additionally, compression would reduce the amount of data sent from the disk
to the application.
\section{Ethical Considerations}

\subsection{Considerations of Computing Resources}

Since a database is at the core of most software systems, it is important that
the database is designed to work on a wide variety of computers, in order to
ensure all parties are able to take advantage of the improvements. I designed
my database to run on entry-level commodity hardware, as well as alongside
existing application programs that can require far more resources.
Additionally, by focusing on disk storage, which is far cheaper than
equivalent capacities of memory, the design further allows researchers or
individuals to run large datasets on a single machine.
My system targets far less memory usage than existing commercial applications
\footnote{\url{https://docs.oracle.com/en/database/oracle/oracle-database/12.2/ntdbi/oracle-database-minimum-hardware-requirements.html}}
\footnote{\url{https://wiki.lustre.org/Lustre_Server_Requirements_Guidelines}}.
In the design of my application I had to take advantage of as much of the
computing hardware as possible, while making sure that the approachability and
accessibility of the application did not decrease as a result.
\subsection{Considerations of Complexity}
Another factor to consider in the implementation of my database is how complex
the existing systems are. Two of the most popular SQL databases, PostgreSQL
and MySQL, have 1.4 and 4.4 million lines of code respectively
\footnote{\url{https://news.ycombinator.com/item?id=24813239}}.
Because these systems are so complex, fewer people can effectively work with
and maintain them, effectively limiting that role to larger companies that can
afford teams of people to solve these problems for them. By avoiding the
significant complexity that comes with caching logic, and keeping the server
implementation simple, I allow more companies and developers to use this
database for their own needs and expand on it. In addition, many decisions
were made to help in the debugging process, including the choice of JSON
serialization for the chunk data, which allows users to read the contents of
files more easily and to recover potentially corrupted data.
\subsection{Considerations in Security}

Since databases are very complex, there is a risk that a server reachable over
the internet through the Minecraft game server might be exposed to attacks.
While this is a large issue, an even more important implication is the ability
to configure the database correctly. Because these databases are extremely
complex, it is also very hard to make sure that they are configured securely.
There have been many high-profile data
breaches\footnote{\url{https://www.zdnet.com/article/hacker-ransoms-23k-mongodb-databases-and-threatens-to-contact-gdpr-authorities/}}
that involve a single server, even at larger companies that have dedicated
security teams.
I mitigate this risk by implementing the database in a memory-safe programming
language, Go, which should remove the class of memory-unsafety bugs that
account for around 70\% of all security bugs in the Chromium browser
engine\footnote{\url{https://www.chromium.org/Home/chromium-security/memory-safety/}},
which is written entirely in non-memory-safe C++.
However, there is the possibility that information stored in the database is
exposed, whether because the database was not secured or through an
application error. Here, my database follows the threat model of many other
databases, and leaves security up to the user implementing the application.
Implementing features such as encryption would provide some additional layer
of security, but would also likely decrease performance and increase
complexity, which are harmful to security in their own ways. Ultimately, I
rely on a set of defaults that doesn't make any assumptions about the security
of the system.
\subsection{Considerations in Fairness}

In the implementation of databases, it can often be beneficial to make certain
operations faster at the expense of others that are not performed as often.
For instance, if I notice that researchers write to the database more often
than they read from it, I can adjust the application accordingly and take
advantage of this assumption to speed up the most common operations. However,
this can be problematic if the things I choose to sacrifice affect a certain
group of users.
This tradeoff between speed and reliability occurs so often in computer
science that it is described in terms of percentiles. For instance, if some
event occurs about half the time, we say it is at the 50th percentile; if an
event occurs only 1\% of the time, we say it is at the 99th percentile. The
impossibility of making such a decision without hurting anyone is written
about by Google \cite{dean2013tail}, who must make decisions like this at
their scale.
My database keeps a consistent set of guarantees with regard to the complexity
of the basic operations, providing constant-time behavior for most of them.
\subsection{Considerations in Accessibility}

In creating this system, I also have to consider whether players will require
a certain type of computer. Requiring a certain operating system or a more
powerful computer would lock out many of the people who were playing the game
before.
However, with the previous performance goals, as well as an implementation in
a portable language, the program is available for as many systems as the Go
compiler supports.

\printbibliography

\end{document}
paper/oxycomps.sty (new file, 102 lines)
@@ -0,0 +1,102 @@
% A simple two-column LaTeX style for Occidental College's CS senior projects.
% Based on latex8.sty by Paolo.Ienne@di.epfl.ch

\usepackage{times} % use Times as the default font
% define bold 11pt Times font for second-order headings
\font\elvbf = ptmb scaled 1100

\usepackage[style=numeric,sorting=nyt]{biblatex} % format the bibliography nicely
\usepackage{xpatch} % used to patch \textcite

% change \textcite to do family-name (year)
\xpatchbibmacro{textcite}
  {\printnames{labelname}}
  {\printnames{labelname} (\printfield{year})}
  {}
  {}
% sort bibliography by last name
\DeclareNameAlias{default}{family-given}

\usepackage{amsfonts} % provides many math symbols/fonts
\usepackage{amsmath} % provides many math environments
\usepackage{amssymb} % provides many math symbols/fonts
\usepackage{caption} % fixes caption spacing issues
\usepackage[usenames,dvipsnames]{color} % allows for colored text
\usepackage{enumitem} % allows adjustment of list spacing
\usepackage{graphicx} % allows insertion of graphics
\usepackage{hyperref} % creates links within the page and to URLs
\usepackage{listings} % provides the lstlisting environment
\usepackage{url} % formats URLs properly
\usepackage{verbatim} % provides the comment environment

% set dimensions of columns, gap between columns, and paragraph indent
\setlength{\textheight}{8.875in}
\setlength{\textwidth}{6.875in}
\setlength{\columnsep}{0.3125in}
\setlength{\topmargin}{0in}
\setlength{\headheight}{0in}
\setlength{\headsep}{0in}
\setlength{\parindent}{1em}
\setlength{\oddsidemargin}{-.304in}
\setlength{\evensidemargin}{-.304in}

% remove the space between list items
\setlist{noitemsep}

% style code listings
\lstset{
  basicstyle=\ttfamily\footnotesize,
  breaklines=true,
  showstringspaces=false
}

% style the title
\def\@maketitle{
  \newpage
  \begin{center}
  {\Large \bf \@title \par}
  % add two empty lines at the end of the title
  \vspace*{2\baselineskip}
  {
    \large
    \begin{tabular}[t]{c}
      \@author
    \end{tabular}
    \par
  }
  % add small space at the end of the author name
  \vspace*{.5em}
  {
    \ifx \@empty \@email
    \else
      \texttt{\@email}
      \par
      \vspace*{.25em}
    \fi
    \ifx \@empty \@affiliation
    \else
      \@affiliation
    \fi
  }
  % add empty line at the end of the title block
  \vspace*{\baselineskip}
  \end{center}
}

% style the abstract
\def\abstract{%
  \centerline{\large\bf Abstract}%
  \vspace*{\baselineskip}%
}

% define email and affiliation
\def\email#1{\gdef\@email{#1}}
\gdef\@email{}
\def\affiliation#1{\gdef\@affiliation{#1}}
\gdef\@affiliation{}

% correct heading spacing and type
\def\section{\@startsection {section}{1}{\z@}
  {14pt plus 2pt minus 2pt}{14pt plus 2pt minus 2pt} {\large\bf}}
\def\subsection{\@startsection {subsection}{2}{\z@}
  {13pt plus 2pt minus 2pt}{13pt plus 2pt minus 2pt} {\elvbf}}
paper/references.bib (new file, 315 lines)
@@ -0,0 +1,315 @@
// Introduction

@misc{sqliteOnlyDatabase,
    title={SQLite the only database you will ever need in most cases},
    url={https://unixsheikh.com/articles/sqlite-the-only-database-you-will-ever-need-in-most-cases.html},
    journal={https://unixsheikh.com/},
    publisher={https://unixsheikh.com/},
    author={Sheikh, Unix},
    year={2021},
    month={Apr},
}

@misc{enwiki:1181180757,
    author = "{Wikipedia contributors}",
    title = "Model–view–controller --- {Wikipedia}{,} The Free Encyclopedia",
    year = "2023",
    howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Model%E2%80%93view%E2%80%93controller&oldid=1181180757}",
    note = "[Online; accessed 13-December-2023]"
}

@online{googlePerfTeam,
    author = {{Google Performance Team}},
    title = {System Performance},
    month = {May},
    year = {2023},
    url = {https://research.google/teams/system-performance/},
}

// Applications of voxels

@misc{enwiki:1186283262,
    author = "{Wikipedia contributors}",
    title = "Voxel --- {Wikipedia}{,} The Free Encyclopedia",
    year = "2023",
    howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Voxel&oldid=1186283262}",
    note = "[Online; accessed 13-December-2023]"
}

@article{norman2006beyond,
    title={Beyond mind-reading: multi-voxel pattern analysis of fMRI data},
    author={Norman, Kenneth A and Polyn, Sean M and Detre, Greg J and Haxby, James V},
    journal={Trends in cognitive sciences},
    volume={10},
    number={9},
    pages={424--430},
    year={2006},
    publisher={Elsevier}
}

@article{museth2013vdb,
    title={VDB: High-resolution sparse volumes with dynamic topology},
    author={Museth, Ken},
    journal={ACM transactions on graphics (TOG)},
    volume={32},
    number={3},
    pages={1--22},
    year={2013},
    publisher={ACM New York, NY, USA}
}

@article{li2020deep,
    title={Deep learning for lidar point clouds in autonomous driving: A review},
    author={Li, Ying and Ma, Lingfei and Zhong, Zilong and Liu, Fei and Chapman, Michael A and Cao, Dongpu and Li, Jonathan},
    journal={IEEE Transactions on Neural Networks and Learning Systems},
    volume={32},
    number={8},
    pages={3412--3432},
    year={2020},
    publisher={IEEE}
}

// Literature Review

@article{williams1992voxel,
    title={Voxel databases: A paradigm for parallelism with spatial structure},
    author={Williams, Roy D},
    journal={Concurrency: Practice and Experience},
    volume={4},
    number={8},
    pages={619--636},
    year={1992},
    publisher={Wiley Online Library}
}

@article{gorte2023analysis,
    title={Analysis of very large voxel datasets},
    author={Gorte, Ben},
    journal={International Journal of Applied Earth Observation and Geoinformation},
    volume={119},
    pages={103316},
    year={2023},
    publisher={Elsevier}
}

@online{tigerbeetleDesign,
    author = {{Tigerbeetle Developers}},
    title = {Tigerbeetle Design Document},
    month = {July},
    year = {2020},
    url = {https://github.com/tigerbeetledb/tigerbeetle/blob/main/docs/DESIGN.md},
}

@online{tigerbeetlePerf,
    author = {{Tigerbeetle Developers}},
    title = {Tigerbeetle Design Document},
    month = {July},
    year = {2020},
    url = {https://github.com/tigerbeetledb/tigerbeetle/blob/main/docs/HISTORY.md},
}

@online{nomiSlowME,
    author = {{Jokercortex}},
    title = {Moron's Guide to Managing Mechanical Monstrosities},
    month = {Feb},
    year = {2020},
    url = {https://github.com/Nomifactory/Guides/blob/latest/guides/AE2ForDummies.md},
}

@misc{btree,
    author = "{Wikipedia contributors}",
    title = "B-tree --- {Wikipedia}{,} The Free Encyclopedia",
    year = "2023",
    url = "https://en.wikipedia.org/w/index.php?title=B-tree&oldid=1146616935",
    note = "[Online; accessed 13-May-2023]"
}

@misc{kvdatabase,
    author = "{Wikipedia contributors}",
    title = "Key–value database --- {Wikipedia}{,} The Free Encyclopedia",
    year = "2023",
    url = "https://en.wikipedia.org/w/index.php?title=Key%E2%80%93value_database&oldid=1135560734",
    note = "[Online; accessed 13-May-2023]"
}

@online{latencyKnow,
    author = "Jeff Dean",
    title = "Latency Numbers Every Programmer Should Know",
    year = "2018",
    url = "https://gist.github.com/jboner/2841832",
    note = "[Online; accessed 12-Dec-2023]"
}

@online{cockroachData,
    author = {{CockroachDB Developers}},
    title = {Structured data encoding in CockroachDB SQL},
    year = {2017},
    month = {Mar},
    url = {https://github.com/cockroachdb/cockroach/blob/master/docs/tech-notes/encoding.md},
}

@article{dong2021rocksdb,
    title={Rocksdb: Evolution of development priorities in a key-value store serving large-scale applications},
    author={Dong, Siying and Kryczka, Andrew and Jin, Yanqin and Stumm, Michael},
    journal={ACM Transactions on Storage (TOS)},
    volume={17},
    number={4},
    pages={1--32},
    year={2021},
    publisher={ACM New York, NY}
}

@misc{lsm,
    author = "{Wikipedia contributors}",
    title = "Log-structured merge-tree --- {Wikipedia}{,} The Free Encyclopedia",
    year = "2023",
    url = "https://en.wikipedia.org/w/index.php?title=Log-structured_merge-tree&oldid=1153046573",
    note = "[Online; accessed 13-May-2023]"
}

@online{lsmUses,
    author = {{Braden Groom}},
    title = {Understanding LSM Trees: What Powers Write-Heavy Databases},
    month = {Jun},
    year = {2020},
    url = {https://yetanotherdevblog.com/lsm/},
}

@article{chang2008bigtable,
    title={Bigtable: A distributed storage system for structured data},
    author={Chang, Fay and Dean, Jeffrey and Ghemawat, Sanjay and Hsieh, Wilson C and Wallach, Deborah A and Burrows, Mike and Chandra, Tushar and Fikes, Andrew and Gruber, Robert E},
    journal={ACM Transactions on Computer Systems (TOCS)},
    volume={26},
    number={2},
    pages={1--26},
    year={2008},
    publisher={ACM New York, NY, USA}
}

@inproceedings{abadi2008column,
    title={Column-stores vs. row-stores: how different are they really?},
    author={Abadi, Daniel J and Madden, Samuel R and Hachem, Nabil},
    booktitle={Proceedings of the 2008 ACM SIGMOD international conference on Management of data},
    pages={967--980},
    year={2008}
}

@article{athanassoulis2019optimal,
    title={Optimal column layout for hybrid workloads},
    author={Athanassoulis, Manos and B{\o}gh, Kenneth S and Idreos, Stratos},
    journal={Proceedings of the VLDB Endowment},
    volume={12},
    number={13},
    pages={2393--2407},
    year={2019},
    publisher={VLDB Endowment}
}

@inproceedings{armbrust2021lakehouse,
    title={Lakehouse: a new generation of open platforms that unify data warehousing and advanced analytics},
    author={Armbrust, Michael and Ghodsi, Ali and Xin, Reynold and Zaharia, Matei},
    booktitle={Proceedings of CIDR},
    volume={8},
    year={2021}
}

@article{dean2013tail,
    title={The tail at scale},
    author={Dean, Jeffrey and Barroso, Luiz Andr{\'e}},
    journal={Communications of the ACM},
    volume={56},
    number={2},
    pages={74--80},
    year={2013},
    publisher={ACM New York, NY, USA}
}

@misc{githubSingleSQL,
    title={Partitioning GitHub's relational databases to handle scale},
    url={https://github.blog/2021-09-27-partitioning-githubs-relational-databases-scale/},
    journal={The GitHub Blog},
    publisher={GitHub},
    author={Maurer, Thomas},
    year={2021},
    month={Sep},
}

@inproceedings{bronson2013tao,
    title={{TAO}: Facebook's distributed data store for the social graph},
    author={Bronson, Nathan and Amsden, Zach and Cabrera, George and Chakka, Prasad and Dimov, Peter and Ding, Hui and Ferris, Jack and Giardullo, Anthony and Kulkarni, Sachin and Li, Harry and others},
    booktitle={2013 {USENIX} Annual Technical Conference ({USENIX} {ATC} 13)},
    pages={49--60},
    year={2013}
}

@inproceedings{chatterji2002complexity,
    title={On the complexity of approximate query optimization},
    author={Chatterji, Sourav and Evani, Sai Surya Kiran and Ganguly, Sumit and Yemmanuru, Mahesh Datt},
    booktitle={Proceedings of the twenty-first ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems},
    pages={282--292},
    year={2002}
}

@inproceedings{gjengset2018noria,
    title={Noria: dynamic, partially-stateful data-flow for high-performance web applications.},
    author={Gjengset, Jon and Schwarzkopf, Malte and Behrens, Jonathan and Ara{\'u}jo, Lara Timb{\'o} and Ek, Martin and Kohler, Eddie and Kaashoek, M Frans and Morris, Robert Tappan},
    booktitle={OSDI},
    volume={18},
    pages={213--231},
    year={2018}
}

// How storage works in database systems, and the evolution of how data is stored
@article{stonebraker2005goes,
    title={What goes around comes around},
    author={Stonebraker, Michael and Hellerstein, Joey},
    journal={Readings in database systems},
    volume={4},
    pages={1},
    year={2005}
}

@article{vohra2016apache,
    title={Apache parquet},
    author={Vohra, Deepak and Vohra, Deepak},
    journal={Practical Hadoop Ecosystem: A Definitive Guide to Hadoop-Related Frameworks and Tools},
    pages={325--335},
    year={2016},
    publisher={Springer}
}

@inproceedings{nishtala2013scaling,
    title={Scaling memcache at facebook},
    author={Nishtala, Rajesh and Fugal, Hans and Grimm, Steven and Kwiatkowski, Marc and Lee, Herman and Li, Harry C and McElroy, Ryan and Paleczny, Mike and Peek, Daniel and Saab, Paul and others},
    booktitle={Presented as part of the 10th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 13)},
    pages={385--398},
    year={2013}
}

@inproceedings{zhou2021foundationdb,
    title={Foundationdb: A distributed unbundled transactional key value store},
    author={Zhou, Jingyu and Xu, Meng and Shraer, Alexander and Namasivayam, Bala and Miller, Alex and Tschannen, Evan and Atherton, Steve and Beamon, Andrew J and Sears, Rusty and Leach, John and others},
    booktitle={Proceedings of the 2021 International Conference on Management of Data},
    pages={2653--2666},
    year={2021}
}

@article{lakshman2010cassandra,
    title={Cassandra: a decentralized structured storage system},
    author={Lakshman, Avinash and Malik, Prashant},
    journal={ACM SIGOPS operating systems review},
    volume={44},
    number={2},
    pages={35--40},
    year={2010},
    publisher={ACM New York, NY, USA}
}

@misc{veloren32,
    title = "This Week In Veloren 32",
    author = "AngelOnFira",
    month = "September",
    year = "2019",
    url = "https://veloren.net/blog/devblog-32/"
}
paper/unity-file.drawio (new file, 53 lines)
@@ -0,0 +1,53 @@
<mxfile host="Electron" modified="2023-12-14T09:51:26.683Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/22.0.2 Chrome/114.0.5735.289 Electron/25.8.4 Safari/537.36" etag="iOiW5F6x8VUFkmnMflTj" version="22.0.2" type="device">
|
||||||
|
<diagram name="Page-1" id="TafIrdbnw2cWi4bqOyK2">
|
||||||
|
<mxGraphModel dx="1114" dy="999" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
|
||||||
|
<root>
|
||||||
|
<mxCell id="0" />
|
||||||
|
<mxCell id="1" parent="0" />
|
||||||
|
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-1" value="" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
|
||||||
|
<mxGeometry x="40" y="20" width="120" height="200" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-2" value="Chunk 1" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fff2cc;strokeColor=#d6b656;" vertex="1" parent="1">
|
||||||
|
<mxGeometry x="50" y="50" width="100" height="40" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-3" value="Chunk 2" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fff2cc;strokeColor=#d6b656;" vertex="1" parent="1">
|
||||||
|
<mxGeometry x="50" y="100" width="100" height="40" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-6" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" edge="1" parent="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="100" y="210" as="sourcePoint" />
|
||||||
|
<mxPoint x="100" y="150" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-7" value="Metadata" style="swimlane;fontStyle=0;childLayout=stackLayout;horizontal=1;startSize=30;horizontalStack=0;resizeParent=1;resizeParentMax=0;resizeLast=0;collapsible=1;marginBottom=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
|
||||||
|
<mxGeometry x="230" y="40" width="140" height="90" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-8" value="Start: 0, Size: 2" style="text;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;spacingLeft=4;spacingRight=4;overflow=hidden;points=[[0,0.5],[1,0.5]];portConstraint=eastwest;rotatable=0;whiteSpace=wrap;html=1;" vertex="1" parent="f65CT_Lw4DzFi_7RwwvQ-7">
|
||||||
|
<mxGeometry y="30" width="140" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-9" value="Start: 2, Size 3" style="text;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;spacingLeft=4;spacingRight=4;overflow=hidden;points=[[0,0.5],[1,0.5]];portConstraint=eastwest;rotatable=0;whiteSpace=wrap;html=1;" vertex="1" parent="f65CT_Lw4DzFi_7RwwvQ-7">
|
||||||
|
<mxGeometry y="60" width="140" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-11" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=1;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="f65CT_Lw4DzFi_7RwwvQ-8" target="f65CT_Lw4DzFi_7RwwvQ-2">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="190" y="85" />
|
||||||
|
<mxPoint x="190" y="50" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-12" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=1;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="f65CT_Lw4DzFi_7RwwvQ-9" target="f65CT_Lw4DzFi_7RwwvQ-3">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="190" y="115" />
|
||||||
|
<mxPoint x="190" y="100" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-14" value="Unity File" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="1">
|
||||||
|
<mxGeometry x="70" y="20" width="60" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
</root>
|
||||||
|
</mxGraphModel>
|
||||||
|
</diagram>
|
||||||
|
</mxfile>
|
BIN
paper/unity-file.drawio.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 17 KiB |
@ -1,8 +1,8 @@
 package server

 import (
-    "git.nicholasnovak.io/nnovak/spatial-db/storage"
+    "github.com/NickyBoy89/spatial-db/storage"
-    "git.nicholasnovak.io/nnovak/spatial-db/world"
+    "github.com/NickyBoy89/spatial-db/world"
     log "github.com/sirupsen/logrus"
 )

@ -6,8 +6,8 @@ import (
     "os"
     "path/filepath"

-    "git.nicholasnovak.io/nnovak/spatial-db/storage"
+    "github.com/NickyBoy89/spatial-db/storage"
-    "git.nicholasnovak.io/nnovak/spatial-db/world"
+    "github.com/NickyBoy89/spatial-db/world"
 )

 type InMemoryServer struct {

@ -7,8 +7,8 @@ import (
     "os"
     "path/filepath"

-    "git.nicholasnovak.io/nnovak/spatial-db/storage"
+    "github.com/NickyBoy89/spatial-db/storage"
-    "git.nicholasnovak.io/nnovak/spatial-db/world"
+    "github.com/NickyBoy89/spatial-db/world"
 )

 const fileCacheSize = 8

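The fileCacheSize constant in the hunk above caps how many chunk files the disk-backed server keeps open at once. A minimal sketch of a bounded open-file cache built around such a constant follows; the type, field, and method names, and the arbitrary-eviction policy, are assumptions for illustration, not the repository's implementation.

package storage

import "os"

const fileCacheSizeSketch = 8

// openFileCache keeps at most fileCacheSizeSketch files open at once,
// evicting an arbitrary entry when the cache is full.
type openFileCache struct {
    files map[string]*os.File
}

// get returns a cached handle for path, opening (and caching) it on a miss.
func (c *openFileCache) get(path string) (*os.File, error) {
    if f, ok := c.files[path]; ok {
        return f, nil
    }
    if c.files == nil {
        c.files = make(map[string]*os.File)
    }
    if len(c.files) >= fileCacheSizeSketch {
        // Evict an arbitrary cached file to stay within the bound.
        for p, f := range c.files {
            f.Close()
            delete(c.files, p)
            break
        }
    }
    f, err := os.Open(path)
    if err != nil {
        return nil, err
    }
    c.files[path] = f
    return f, nil
}
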
@ -8,7 +8,7 @@ import (
     "strings"
     "sync"

-    "git.nicholasnovak.io/nnovak/spatial-db/world"
+    "github.com/NickyBoy89/spatial-db/world"
 )

 func ReadChunkFromFile(chunkFile *os.File) (world.ChunkData, error) {

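The hunk above exposes the signature of ReadChunkFromFile, which decodes a single chunk from an open file handle. A hypothetical call site might look like the sketch below; the file path is a placeholder, and the assumption that the function lives in the storage package is inferred from the surrounding hunks, not confirmed by this diff.

package main

import (
    "os"

    "github.com/NickyBoy89/spatial-db/storage"
    log "github.com/sirupsen/logrus"
)

func main() {
    // Placeholder path; real chunk file names are not shown in this diff.
    chunkFile, err := os.Open("world-data/chunk_0_0.chunk")
    if err != nil {
        log.Fatal(err)
    }
    defer chunkFile.Close()

    // Decode the chunk from the open handle.
    chunk, err := storage.ReadChunkFromFile(chunkFile)
    if err != nil {
        log.Fatal(err)
    }
    _ = chunk // use the chunk data here
}
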
@ -3,7 +3,7 @@ package storage
 import (
     "errors"

-    "git.nicholasnovak.io/nnovak/spatial-db/world"
+    "github.com/NickyBoy89/spatial-db/world"
 )

 type StorageServer interface {

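This hunk only reveals that a StorageServer interface exists in the storage package, abstracting over the different server backends touched elsewhere in this diff. The sketch below shows the general shape such an interface could take; every method name and type in it is invented purely for illustration.

package storage

// blockPos and blockID stand in for whatever position and block types
// the real interface operates on; they are placeholders.
type blockPos struct{ X, Y, Z int }
type blockID uint32

// storageServerSketch is a purely illustrative method set.
type storageServerSketch interface {
    // ReadBlockAt returns the block stored at an absolute world position.
    ReadBlockAt(pos blockPos) (blockID, error)
    // ChangeBlock overwrites the block at an absolute world position.
    ChangeBlock(pos blockPos, target blockID) error
}
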
@ -7,7 +7,7 @@ import (
     "io/fs"
     "os"

-    "git.nicholasnovak.io/nnovak/spatial-db/world"
+    "github.com/NickyBoy89/spatial-db/world"
 )

 // A `UnityFile` is a collection of chunks, stored as a single file on disk

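This comment, together with the unity-file.drawio diagram added above (two chunks packed into one file, with metadata entries such as "Start: 0, Size: 2" pointing into it), suggests an offset-indexed container format. A minimal sketch of that layout follows; the struct, field, and method names are assumptions for illustration, not the repository's actual definitions.

package storage

import "os"

// fileMetadata records where one chunk's bytes live inside the unity
// file, mirroring the "Start: 0, Size: 2" entries in the diagram.
type fileMetadata struct {
    Start int64 // byte offset of the chunk within the file
    Size  int64 // number of bytes the chunk occupies
}

// unityFileSketch bundles many chunks into a single file plus an
// in-memory index from chunk name to location.
type unityFileSketch struct {
    file     *os.File
    metadata map[string]fileMetadata
}

// readChunkBytes looks up a chunk's recorded offset and reads it back.
func (u *unityFileSketch) readChunkBytes(name string) ([]byte, error) {
    md, ok := u.metadata[name]
    if !ok {
        return nil, os.ErrNotExist
    }
    buf := make([]byte, md.Size)
    if _, err := u.file.ReadAt(buf, md.Start); err != nil {
        return nil, err
    }
    return buf, nil
}
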
@ -7,7 +7,7 @@ import (
     "reflect"
     "testing"

-    "git.nicholasnovak.io/nnovak/spatial-db/world"
+    "github.com/NickyBoy89/spatial-db/world"
 )

 func TestCreateUnityFile(t *testing.T) {

@ -4,8 +4,8 @@ import (
     "errors"
     "testing"

-    "git.nicholasnovak.io/nnovak/spatial-db/storage"
+    "github.com/NickyBoy89/spatial-db/storage"
-    "git.nicholasnovak.io/nnovak/spatial-db/world"
+    "github.com/NickyBoy89/spatial-db/world"
 )

 func readBlockTemplate(

@ -4,9 +4,9 @@ import (
     "errors"
     "strings"

-    "git.nicholasnovak.io/nnovak/spatial-db/server"
+    "github.com/NickyBoy89/spatial-db/server"
-    "git.nicholasnovak.io/nnovak/spatial-db/storage"
+    "github.com/NickyBoy89/spatial-db/storage"
-    "git.nicholasnovak.io/nnovak/spatial-db/world"
+    "github.com/NickyBoy89/spatial-db/world"
     tea "github.com/charmbracelet/bubbletea"
     "github.com/charmbracelet/lipgloss"
     "github.com/spf13/cobra"